@misc{Abnar_Zuidema_2020,
	title        = {Quantifying Attention Flow in Transformers},
	author       = {Abnar, Samira and Zuidema, Willem},
	year         = 2020,
	month        = may,
	eprint       = {2005.00928},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2005.00928}
}
@misc{Alain_Bengio_2018,
	title        = {Understanding intermediate layers using linear classifier probes},
	author       = {Alain, Guillaume and Bengio, Yoshua},
	year         = 2018,
	month        = nov,
	eprint       = {1610.01644},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1610.01644}
}
@misc{Alvarez-Melis_Jaakkola_2018,
	title        = {Towards Robust Interpretability with Self-Explaining Neural Networks},
	author       = {Alvarez-Melis, David and Jaakkola, Tommi S.},
	year         = 2018,
	month        = dec,
	eprint       = {1806.07538},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1806.07538}
}
@inproceedings{Amershi_Chickering_Drucker_Lee_Simard_Suh_2015,
	title        = {{ModelTracker}: Redesigning Performance Analysis Tools for Machine Learning},
	author       = {Amershi, Saleema and Chickering, Max and Drucker, Steven M. and Lee, Bongshin and Simard, Patrice and Suh, Jina},
	year         = 2015,
	month        = apr,
	booktitle    = {Proceedings of the 33rd Annual {ACM} Conference on Human Factors in Computing Systems},
	publisher    = {ACM},
	address      = {Seoul, Republic of Korea},
	pages        = {337--346},
	doi          = {10.1145/2702123.2702509},
	isbn         = {978-1-4503-3145-6},
	url          = {https://dl.acm.org/doi/10.1145/2702123.2702509},
	language     = {en}
}
@misc{Amodei_Olah_Steinhardt_Christiano_Schulman_Mané_2016,
	title        = {Concrete Problems in {AI} Safety},
	author       = {Amodei, Dario and Olah, Chris and Steinhardt, Jacob and Christiano, Paul and Schulman, John and Mané, Dan},
	year         = 2016,
	month        = jul,
	eprint       = {1606.06565},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1606.06565}
}
@misc{Andoni_Indyk_Laarhoven_Razenshteyn_Schmidt_2015,
	title        = {Practical and Optimal {LSH} for Angular Distance},
	author       = {Andoni, Alexandr and Indyk, Piotr and Laarhoven, Thijs and Razenshteyn, Ilya and Schmidt, Ludwig},
	year         = 2015,
	month        = sep,
	eprint       = {1509.02897},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1509.02897}
}
@misc{Araci_2019,
	title        = {{FinBERT}: Financial Sentiment Analysis with Pre-trained Language Models},
	author       = {Araci, Dogu},
	year         = 2019,
	month        = aug,
	eprint       = {1908.10063},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1908.10063}
}
@misc{Arkhangelskaia_Dutta_2019,
	title        = {Whatcha lookin' at? {DeepLIFTing} {BERT}'s Attention in Question Answering},
	author       = {Arkhangelskaia, Ekaterina and Dutta, Sourav},
	year         = 2019,
	month        = oct,
	eprint       = {1910.06431},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1910.06431}
}
@inproceedings{Artetxe_Ruder_Yogatama_2020,
	title        = {On the Cross-lingual Transferability of Monolingual Representations},
	author       = {Artetxe, Mikel and Ruder, Sebastian and Yogatama, Dani},
	year         = 2020,
	booktitle    = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
	pages        = {4623--4637},
	doi          = {10.18653/v1/2020.acl-main.421},
	eprint       = {1910.11856},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1910.11856}
}
@article{Artetxe_Schwenk_2019,
	title        = {Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond},
	author       = {Artetxe, Mikel and Schwenk, Holger},
	year         = 2019,
	month        = nov,
	journal      = {Transactions of the Association for Computational Linguistics},
	volume       = 7,
	pages        = {597--610},
	doi          = {10.1162/tacl_a_00288},
	issn         = {2307-387X},
	language     = {en}
}
% NOTE(review): publisher changed from the Google Books distributor name
% (Simon and Schuster) to Manning Publications, which matches the 978-1-61729
% ISBN prefix; confirm. The Google Books id is kept in a non-printing field.
@book{Azunre_2021,
	title        = {Transfer Learning for Natural Language Processing},
	author       = {Azunre, Paul},
	year         = 2021,
	month        = aug,
	publisher    = {Manning Publications},
	isbn         = {978-1-61729-726-7},
	googlebooksid = {bGI7EAAAQBAJ},
	language     = {en}
}
@misc{Ba_Kiros_Hinton_2016,
	title        = {Layer Normalization},
	author       = {Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E.},
	year         = 2016,
	month        = jul,
	eprint       = {1607.06450},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1607.06450}
}
@article{Bach_Binder_Montavon_Klauschen_Müller_Samek_2015,
	author       = {Bach, Sebastian and Binder, Alexander and Montavon, Grégoire and Klauschen, Frederick and Müller, Klaus-Robert and Samek, Wojciech},
	title        = {On Pixel-Wise Explanations for Non-Linear Classifier Decisions by Layer-Wise Relevance Propagation},
	journal      = {PLOS ONE},
	editor       = {Suarez, Oscar Deniz},
	year         = {2015},
	month        = jul,
	volume       = {10},
	number       = {7},
	pages        = {e0130140},
	issn         = {1932-6203},
	doi          = {10.1371/journal.pone.0130140},
	language     = {en}
}
% NOTE(review): the original export had only title and five authors. The
% journal, year, volume, pages and sixth author below were filled in from the
% published JMLR version of this paper; verify against the journal page.
@article{Baehrens_Schroeter_Harmeling_Kawanabe_Hansen,
	title        = {How to Explain Individual Classification Decisions},
	author       = {Baehrens, David and Schroeter, Timon and Harmeling, Stefan and Kawanabe, Motoaki and Hansen, Katja and Müller, Klaus-Robert},
	year         = 2010,
	journal      = {Journal of Machine Learning Research},
	volume       = 11,
	pages        = {1803--1831},
	language     = {en}
}
@misc{Baevski_Zhou_Mohamed_Auli_2020,
	title        = {wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations},
	author       = {Baevski, Alexei and Zhou, Henry and Mohamed, Abdelrahman and Auli, Michael},
	year         = 2020,
	month        = oct,
	eprint       = {2006.11477},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2006.11477}
}
@misc{Bahdanau_Cho_Bengio_2016,
	title        = {Neural Machine Translation by Jointly Learning to Align and Translate},
	author       = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
	year         = 2016,
	month        = may,
	eprint       = {1409.0473},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1409.0473}
}
@misc{Bapna_Arivazhagan_Firat_2020,
	title        = {Controlling Computation versus Quality for Neural Sequence Models},
	author       = {Bapna, Ankur and Arivazhagan, Naveen and Firat, Orhan},
	year         = 2020,
	month        = apr,
	eprint       = {2002.07106},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2002.07106}
}
@misc{Bapna_Chen_Firat_Cao_Wu_2018,
	title        = {Training Deeper Neural Machine Translation Models with Transparent Attention},
	author       = {Bapna, Ankur and Chen, Mia Xu and Firat, Orhan and Cao, Yuan and Wu, Yonghui},
	year         = 2018,
	month        = sep,
	doi          = {10.48550/arXiv.1808.07561},
	eprint       = {1808.07561},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1808.07561}
}
@article{Barocas_Boyd_2017,
	title        = {Engaging the ethics of data science in practice},
	author       = {Barocas, Solon and Boyd, Danah},
	year         = 2017,
	month        = oct,
	journal      = {Communications of the ACM},
	volume       = 60,
	number       = 11,
	pages        = {23--25},
	doi          = {10.1145/3144172},
	issn         = {0001-0782, 1557-7317},
	language     = {en}
}
@misc{Bastani_Kim_Bastani_2019,
	title        = {Interpreting Blackbox Models via Model Extraction},
	author       = {Bastani, Osbert and Kim, Carolyn and Bastani, Hamsa},
	year         = 2019,
	month        = jan,
	eprint       = {1705.08504},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1705.08504}
}
@inproceedings{Baum_Köhl_Schmidt_2017,
	title        = {Two Challenges for {CI} Trustworthiness and How to Address Them},
	author       = {Baum, Kevin and Köhl, Maximilian A. and Schmidt, Eva},
	year         = 2017,
	booktitle    = {Proceedings of the 1st Workshop on Explainable Computational Intelligence ({XCI} 2017)},
	publisher    = {Association for Computational Linguistics},
	address      = {Dundee, United Kingdom},
	doi          = {10.18653/v1/W17-3701},
	url          = {http://aclweb.org/anthology/W17-3701},
	language     = {en}
}
@misc{Belinkov_2021,
	title        = {Probing Classifiers: Promises, Shortcomings, and Advances},
	author       = {Belinkov, Yonatan},
	year         = 2021,
	month        = sep,
	eprint       = {2102.12452},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2102.12452}
}
@misc{Beltagy_Lo_Cohan_2019,
	title        = {{SciBERT}: A Pretrained Language Model for Scientific Text},
	author       = {Beltagy, Iz and Lo, Kyle and Cohan, Arman},
	year         = 2019,
	month        = sep,
	doi          = {10.48550/arXiv.1903.10676},
	eprint       = {1903.10676},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1903.10676}
}
@misc{Beltagy_Peters_Cohan_2020,
	title        = {Longformer: The Long-Document Transformer},
	author       = {Beltagy, Iz and Peters, Matthew E. and Cohan, Arman},
	year         = 2020,
	month        = dec,
	eprint       = {2004.05150},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2004.05150}
}
@inproceedings{Bender_Hovy_Schofield_2020,
	title        = {Integrating Ethics into the {NLP} Curriculum},
	author       = {Bender, Emily M. and Hovy, Dirk and Schofield, Alexandra},
	year         = 2020,
	booktitle    = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: Tutorial Abstracts},
	publisher    = {Association for Computational Linguistics},
	address      = {Online},
	pages        = {6--9},
	doi          = {10.18653/v1/2020.acl-tutorials.2},
	url          = {https://www.aclweb.org/anthology/2020.acl-tutorials.2},
	language     = {en}
}
@inproceedings{Bengio_Lamblin_Popovici_Larochelle_2006,
	author       = {Bengio, Yoshua and Lamblin, Pascal and Popovici, Dan and Larochelle, Hugo},
	title        = {Greedy Layer-Wise Training of Deep Networks},
	booktitle    = {Advances in Neural Information Processing Systems},
	volume       = {19},
	publisher    = {MIT Press},
	year         = {2006},
	url          = {https://proceedings.neurips.cc/paper/2006/hash/5da713a690c067105aeb2fae32403405-Abstract.html}
}
@incollection{Bengio_LeCun_2007,
	title        = {Scaling Learning Algorithms toward {AI}},
	author       = {Bengio, Yoshua and LeCun, Yann},
	year         = 2007,
	month        = aug,
	booktitle    = {Large-Scale Kernel Machines},
	publisher    = {The MIT Press},
	pages        = {321--360},
	doi          = {10.7551/mitpress/7496.003.0016},
	isbn         = {978-0-262-25579-0},
	url          = {https://direct.mit.edu/books/book/3172/chapter/88105/Scaling-Learning-Algorithms-toward-AI},
	editor       = {Bottou, Léon and Chapelle, Olivier and DeCoste, Dennis and Weston, Jason},
	language     = {en}
}
@misc{Bertasius_Wang_Torresani_2021,
	title        = {Is Space-Time Attention All You Need for Video Understanding?},
	author       = {Bertasius, Gedas and Wang, Heng and Torresani, Lorenzo},
	year         = 2021,
	month        = jun,
	eprint       = {2102.05095},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2102.05095}
}
@misc{Beutel_Chen_Zhao_Chi_2017,
	title        = {Data Decisions and Theoretical Implications when Adversarially Learning Fair Representations},
	author       = {Beutel, Alex and Chen, Jilin and Zhao, Zhe and Chi, Ed H.},
	year         = 2017,
	month        = jul,
	eprint       = {1707.00075},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1707.00075}
}
% NOTE(review): the export listed the book's author as its editor. The field
% was changed to bookauthor on the assumption that this is a single-author
% monograph; confirm against the publisher record.
@inbook{Bollobás_1998,
	title        = {Random Graphs},
	author       = {Bollobás, Béla},
	year         = 1998,
	booktitle    = {Modern Graph Theory},
	bookauthor   = {Bollobás, Béla},
	publisher    = {Springer},
	address      = {New York, NY},
	pages        = {215--252},
	doi          = {10.1007/978-1-4612-0619-4_7},
	isbn         = {978-1-4612-0619-4},
	url          = {https://doi.org/10.1007/978-1-4612-0619-4_7},
	language     = {en}
}
@misc{Bowman_Vilnis_Vinyals_Dai_Jozefowicz_Bengio_2016,
	title        = {Generating Sentences from a Continuous Space},
	author       = {Bowman, Samuel R. and Vilnis, Luke and Vinyals, Oriol and Dai, Andrew M. and Jozefowicz, Rafal and Bengio, Samy},
	year         = 2016,
	month        = may,
	eprint       = {1511.06349},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1511.06349}
}
@misc{Brown_Mann_Ryder_Subbiah_Kaplan_Dhariwal_Neelakantan_Shyam_Sastry_Askell_etal._2020,
	title        = {Language Models are Few-Shot Learners},
	author       = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel M. and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
	year         = 2020,
	month        = jul,
	eprint       = {2005.14165},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2005.14165}
}
% TODO(review): this book entry has no year or publisher, both of which the
% book type requires; fill them in from the actual edition being cited.
@book{Cage,
	title        = {{Python} Transformers By {Huggingface} Hands On: 101 practical implementation hands-on of {ALBERT}/{ViT}/{BigBird} and other latest models with huggingface transformers},
	author       = {Cage, Joshua K.}
}
@inproceedings{Calders_Kamiran_Pechenizkiy_2009,
	title        = {Building Classifiers with Independency Constraints},
	author       = {Calders, Toon and Kamiran, Faisal and Pechenizkiy, Mykola},
	year         = 2009,
	month        = dec,
	booktitle    = {2009 {IEEE} International Conference on Data Mining Workshops},
	publisher    = {IEEE},
	address      = {Miami, FL, USA},
	pages        = {13--18},
	doi          = {10.1109/ICDMW.2009.83},
	isbn         = {978-1-4244-5384-9},
	url          = {http://ieeexplore.ieee.org/document/5360534/}
}
@inproceedings{Calmon_Wei_Vinzamuri_Natesan_Ramamurthy_Varshney_2017,
	title        = {Optimized Pre-Processing for Discrimination Prevention},
	author       = {Calmon, Flavio and Wei, Dennis and Vinzamuri, Bhanukiran and Natesan Ramamurthy, Karthikeyan and Varshney, Kush R.},
	year         = 2017,
	booktitle    = {Advances in Neural Information Processing Systems},
	publisher    = {Curran Associates, Inc.},
	volume       = 30,
	url          = {https://proceedings.neurips.cc/paper_files/paper/2017/hash/9a49a25d845a483fae4be7e341368e36-Abstract.html}
}
@misc{Camburu_Rocktäschel_Lukasiewicz_Blunsom_2018,
	title        = {{e-SNLI}: Natural Language Inference with Natural Language Explanations},
	author       = {Camburu, Oana-Maria and Rocktäschel, Tim and Lukasiewicz, Thomas and Blunsom, Phil},
	year         = 2018,
	month        = dec,
	eprint       = {1812.01193},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1812.01193}
}
@inproceedings{Cer_Diab_Agirre_Lopez-Gazpio_Specia_2017,
	title        = {{SemEval-2017} Task 1: Semantic Textual Similarity - Multilingual and Cross-lingual Focused Evaluation},
	author       = {Cer, Daniel and Diab, Mona and Agirre, Eneko and Lopez-Gazpio, Iñigo and Specia, Lucia},
	year         = 2017,
	booktitle    = {Proceedings of the 11th International Workshop on Semantic Evaluation ({SemEval-2017})},
	pages        = {1--14},
	doi          = {10.18653/v1/S17-2001},
	eprint       = {1708.00055},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1708.00055}
}
@inproceedings{Charikar_2002,
	title        = {Similarity estimation techniques from rounding algorithms},
	author       = {Charikar, Moses S.},
	year         = 2002,
	booktitle    = {Proceedings of the Thirty-Fourth Annual {ACM} Symposium on Theory of Computing},
	publisher    = {Association for Computing Machinery},
	address      = {New York, NY, USA},
	series       = {{STOC} '02},
	pages        = {380--388},
	doi          = {10.1145/509907.509965},
	isbn         = {978-1-58113-495-7},
	url          = {https://doi.org/10.1145/509907.509965}
}
@misc{Chen_Li_Tao_Barnett_Su_Rudin_2019,
	title        = {This Looks Like That: Deep Learning for Interpretable Image Recognition},
	author       = {Chen, Chaofan and Li, Oscar and Tao, Chaofan and Barnett, Alina Jade and Su, Jonathan and Rudin, Cynthia},
	year         = 2019,
	month        = dec,
	eprint       = {1806.10574},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1806.10574}
}
@misc{Chen_Lu_Rajeswaran_Lee_Grover_Laskin_Abbeel_Srinivas_Mordatch_2021,
	title        = {Decision Transformer: Reinforcement Learning via Sequence Modeling},
	author       = {Chen, Lili and Lu, Kevin and Rajeswaran, Aravind and Lee, Kimin and Grover, Aditya and Laskin, Michael and Abbeel, Pieter and Srinivas, Aravind and Mordatch, Igor},
	year         = 2021,
	month        = jun,
	eprint       = {2106.01345},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2106.01345}
}
@misc{Chi_Hewitt_Manning_2020,
	title        = {Finding Universal Grammatical Relations in Multilingual {BERT}},
	author       = {Chi, Ethan A. and Hewitt, John and Manning, Christopher D.},
	year         = 2020,
	month        = may,
	doi          = {10.48550/arXiv.2005.04511},
	eprint       = {2005.04511},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2005.04511}
}
@misc{Chi_Dong_Wei_Mao_Huang_2019,
	title        = {Can Monolingual Pretrained Models Help Cross-Lingual Classification?},
	author       = {Chi, Zewen and Dong, Li and Wei, Furu and Mao, Xian-Ling and Huang, Heyan},
	year         = 2019,
	month        = nov,
	doi          = {10.48550/arXiv.1911.03913},
	eprint       = {1911.03913},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1911.03913}
}
@article{Chi_Dong_Wei_Wang_Mao_Huang_2020,
	title        = {Cross-Lingual Natural Language Generation via Pre-Training},
	author       = {Chi, Zewen and Dong, Li and Wei, Furu and Wang, Wenhui and Mao, Xian-Ling and Huang, Heyan},
	year         = 2020,
	month        = apr,
	journal      = {Proceedings of the {AAAI} Conference on Artificial Intelligence},
	volume       = 34,
	number       = {05},
	pages        = {7570--7577},
	doi          = {10.1609/aaai.v34i05.6256},
	issn         = {2374-3468},
	rights       = {Copyright (c) 2020 Association for the Advancement of Artificial Intelligence},
	language     = {en}
}
@misc{Chi_Dong_Wei_Yang_Singhal_Wang_Song_Mao_Huang_Zhou_2021,
	title        = {{InfoXLM}: An Information-Theoretic Framework for Cross-Lingual Language Model Pre-Training},
	author       = {Chi, Zewen and Dong, Li and Wei, Furu and Yang, Nan and Singhal, Saksham and Wang, Wenhui and Song, Xia and Mao, Xian-Ling and Huang, Heyan and Zhou, Ming},
	year         = 2021,
	month        = apr,
	eprint       = {2007.07834},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2007.07834}
}
@misc{Chidambaram_Yang_Cer_Yuan_Sung_Strope_Kurzweil_2019,
	title        = {Learning Cross-Lingual Sentence Representations via a Multi-task Dual-Encoder Model},
	author       = {Chidambaram, Muthuraman and Yang, Yinfei and Cer, Daniel and Yuan, Steve and Sung, Yun-Hsuan and Strope, Brian and Kurzweil, Ray},
	year         = 2019,
	month        = aug,
	eprint       = {1810.12836},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1810.12836}
}
@misc{Child_Gray_Radford_Sutskever_2019,
	title        = {Generating Long Sequences with Sparse Transformers},
	author       = {Child, Rewon and Gray, Scott and Radford, Alec and Sutskever, Ilya},
	year         = 2019,
	month        = apr,
	eprint       = {1904.10509},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1904.10509}
}
@misc{Cho_van_Merrienboer_Gulcehre_Bahdanau_Bougares_Schwenk_Bengio_2014,
	title        = {Learning Phrase Representations using {RNN} Encoder-Decoder for Statistical Machine Translation},
	author       = {Cho, Kyunghyun and van Merrienboer, Bart and Gulcehre, Caglar and Bahdanau, Dzmitry and Bougares, Fethi and Schwenk, Holger and Bengio, Yoshua},
	year         = 2014,
	month        = sep,
	eprint       = {1406.1078},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1406.1078}
}
@misc{Choenni_Shutova_2020,
	title        = {What does it mean to be language-agnostic? Probing multilingual sentence encoders for typological properties},
	author       = {Choenni, Rochelle and Shutova, Ekaterina},
	year         = 2020,
	month        = sep,
	eprint       = {2009.12862},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2009.12862}
}
@misc{Choromanski_Likhosherstov_Dohan_Song_Gane_Sarlos_Hawkins_Davis_Mohiuddin_Kaiser_etal._2022,
	title        = {Rethinking Attention with Performers},
	author       = {Choromanski, Krzysztof and Likhosherstov, Valerii and Dohan, David and Song, Xingyou and Gane, Andreea and Sarlos, Tamas and Hawkins, Peter and Davis, Jared and Mohiuddin, Afroz and Kaiser, Lukasz and Belanger, David and Colwell, Lucy and Weller, Adrian},
	year         = 2022,
	month        = nov,
	eprint       = {2009.14794},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2009.14794}
}
@misc{Chung_Févry_Tsai_Johnson_Ruder_2020,
	title        = {Rethinking embedding coupling in pre-trained language models},
	author       = {Chung, Hyung Won and Févry, Thibault and Tsai, Henry and Johnson, Melvin and Ruder, Sebastian},
	year         = 2020,
	month        = oct,
	eprint       = {2010.12821},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2010.12821}
}
@misc{Chung_Garrette_Tan_Riesa_2020,
	title        = {Improving Multilingual Models with Language-Clustered Vocabularies},
	author       = {Chung, Hyung Won and Garrette, Dan and Tan, Kiat Chuan and Riesa, Jason},
	year         = 2020,
	month        = oct,
	eprint       = {2010.12777},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2010.12777}
}
@article{Clark_Choi_Collins_Garrette_Kwiatkowski_Nikolaev_Palomaki_2020,
	title        = {{TyDi QA}: A Benchmark for Information-Seeking Question Answering in Typologically Diverse Languages},
	author       = {Clark, Jonathan H. and Choi, Eunsol and Collins, Michael and Garrette, Dan and Kwiatkowski, Tom and Nikolaev, Vitaly and Palomaki, Jennimaria},
	year         = 2020,
	month        = dec,
	journal      = {Transactions of the Association for Computational Linguistics},
	volume       = 8,
	pages        = {454--470},
	doi          = {10.1162/tacl_a_00317},
	issn         = {2307-387X},
	language     = {en}
}
@misc{Clark_Khandelwal_Levy_Manning_2019,
	title        = {What Does {BERT} Look At? An Analysis of {BERT}'s Attention},
	author       = {Clark, Kevin and Khandelwal, Urvashi and Levy, Omer and Manning, Christopher D.},
	year         = 2019,
	month        = jun,
	eprint       = {1906.04341},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1906.04341}
}
@misc{Clark_Luong_Le_Manning_2020,
	title        = {{ELECTRA}: Pre-training Text Encoders as Discriminators Rather Than Generators},
	author       = {Clark, Kevin and Luong, Minh-Thang and Le, Quoc V. and Manning, Christopher D.},
	year         = 2020,
	month        = mar,
	eprint       = {2003.10555},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2003.10555}
}
@book{Coeckelbergh_2020,
	title        = {{AI} Ethics},
	author       = {Coeckelbergh, Mark},
	year         = 2020,
	month        = apr,
	publisher    = {The MIT Press},
	doi          = {10.7551/mitpress/12549.001.0001},
	isbn         = {978-0-262-35706-7},
	url          = {https://direct.mit.edu/books/book/4612/AI-Ethics},
	language     = {en}
}
@inproceedings{Collobert_Weston_2008,
	title        = {A unified architecture for natural language processing: deep neural networks with multitask learning},
	author       = {Collobert, Ronan and Weston, Jason},
	year         = 2008,
	booktitle    = {Proceedings of the 25th International Conference on Machine Learning},
	publisher    = {ACM Press},
	address      = {Helsinki, Finland},
	series       = {{ICML} '08},
	pages        = {160--167},
	doi          = {10.1145/1390156.1390177},
	isbn         = {978-1-60558-205-4},
	url          = {https://dl.acm.org/doi/10.1145/1390156.1390177},
	language     = {en}
}
@misc{Conneau_Kruszewski_Lample_Barrault_Baroni_2018,
	title        = {What you can cram into a single vector: Probing sentence embeddings for linguistic properties},
	author       = {Conneau, Alexis and Kruszewski, German and Lample, Guillaume and Barrault, Loïc and Baroni, Marco},
	year         = 2018,
	month        = jul,
	eprint       = {1805.01070},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1805.01070}
}
@inproceedings{CONNEAU_Lample_2019,
	title        = {Cross-lingual Language Model Pretraining},
	author       = {Conneau, Alexis and Lample, Guillaume},
	year         = 2019,
	booktitle    = {Advances in Neural Information Processing Systems},
	publisher    = {Curran Associates, Inc.},
	volume       = 32,
	url          = {https://proceedings.neurips.cc/paper_files/paper/2019/hash/c04c19c2c2474dbf5f7ac4372c5b9af1-Abstract.html}
}
@misc{Conneau_Lample_Rinott_Williams_Bowman_Schwenk_Stoyanov_2018,
	title        = {{XNLI}: Evaluating Cross-lingual Sentence Representations},
	author       = {Conneau, Alexis and Lample, Guillaume and Rinott, Ruty and Williams, Adina and Bowman, Samuel R. and Schwenk, Holger and Stoyanov, Veselin},
	year         = 2018,
	month        = sep,
	eprint       = {1809.05053},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1809.05053}
}
@misc{Cordonnier_Loukas_Jaggi_2021,
	title        = {Multi-Head Attention: Collaborate Instead of Concatenate},
	author       = {Cordonnier, Jean-Baptiste and Loukas, Andreas and Jaggi, Martin},
	year         = 2021,
	month        = may,
	eprint       = {2006.16362},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2006.16362}
}
@misc{Dai_Lai_Yang_Le_2020,
	title        = {{Funnel-Transformer}: Filtering out Sequential Redundancy for Efficient Language Processing},
	author       = {Dai, Zihang and Lai, Guokun and Yang, Yiming and Le, Quoc V.},
	year         = 2020,
	month        = jun,
	eprint       = {2006.03236},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2006.03236}
}
@misc{Dai_Yang_Yang_Carbonell_Le_Salakhutdinov_2019,
	title        = {{Transformer-XL}: Attentive Language Models Beyond a Fixed-Length Context},
	author       = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and Carbonell, Jaime and Le, Quoc V. and Salakhutdinov, Ruslan},
	year         = 2019,
	month        = jun,
	eprint       = {1901.02860},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1901.02860}
}
@misc{Dehghani_Gouws_Vinyals_Uszkoreit_Kaiser_2019,
	title        = {Universal Transformers},
	author       = {Dehghani, Mostafa and Gouws, Stephan and Vinyals, Oriol and Uszkoreit, Jakob and Kaiser, Łukasz},
	year         = 2019,
	month        = mar,
	eprint       = {1807.03819},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1807.03819}
}
@inproceedings{Deng_Dong_Socher_Li_Kai_Li_Li_Fei-Fei_2009,
	title        = {{ImageNet}: A large-scale hierarchical image database},
	author       = {Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Li, Fei-Fei},
	year         = 2009,
	month        = jun,
	booktitle    = {2009 {IEEE} Conference on Computer Vision and Pattern Recognition},
	publisher    = {IEEE},
	address      = {Miami, FL},
	pages        = {248--255},
	doi          = {10.1109/CVPR.2009.5206848},
	isbn         = {978-1-4244-3992-8},
	url          = {https://ieeexplore.ieee.org/document/5206848/}
}
@misc{Devlin_Chang_Lee_Toutanova_2019,
	title        = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
	author       = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
	year         = 2019,
	month        = may,
	eprint       = {1810.04805},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1810.04805}
}
@inproceedings{Ding_Liu_Luan_Sun_2017,
	title        = {Visualizing and Understanding Neural Machine Translation},
	author       = {Ding, Yanzhuo and Liu, Yang and Luan, Huanbo and Sun, Maosong},
	year         = 2017,
	booktitle    = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
	publisher    = {Association for Computational Linguistics},
	address      = {Vancouver, Canada},
	pages        = {1150--1159},
	doi          = {10.18653/v1/P17-1106},
	url          = {http://aclweb.org/anthology/P17-1106},
	language     = {en}
}
@misc{Doddapaneni_Ramesh_Khapra_Kunchukuttan_Kumar_2021,
	title        = {A Primer on Pretrained Multilingual Language Models},
	author       = {Doddapaneni, Sumanth and Ramesh, Gowtham and Khapra, Mitesh M. and Kunchukuttan, Anoop and Kumar, Pratyush},
	year         = 2021,
	month        = dec,
	eprint       = {2107.00676},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2107.00676}
}
% NOTE(review): the export had no booktitle, which this entry type requires.
% The venue below is the workshop this paper is believed to have appeared in;
% confirm against the linked publication page.
@inproceedings{Dolan_Brockett_2005,
	title        = {Automatically Constructing a Corpus of Sentential Paraphrases},
	author       = {Dolan, Bill and Brockett, Chris},
	year         = 2005,
	month        = jan,
	booktitle    = {Proceedings of the Third International Workshop on Paraphrasing ({IWP2005})},
	url          = {https://www.microsoft.com/en-us/research/publication/automatically-constructing-a-corpus-of-sentential-paraphrases/},
	language     = {en-US}
}
% NOTE(review): booktitle was missing; the venue is taken from the CVPR 2018
% open-access URL below. Confirm the exact proceedings title wording.
@inproceedings{Dong_Liao_Pang_Su_Zhu_Hu_Li_2018,
	title        = {Boosting Adversarial Attacks With Momentum},
	author       = {Dong, Yinpeng and Liao, Fangzhou and Pang, Tianyu and Su, Hang and Zhu, Jun and Hu, Xiaolin and Li, Jianguo},
	year         = 2018,
	booktitle    = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
	pages        = {9185--9193},
	url          = {https://openaccess.thecvf.com/content_cvpr_2018/html/Dong_Boosting_Adversarial_Attacks_CVPR_2018_paper.html}
}
@inproceedings{Dong_Su_Zhu_Zhang_2017,
	title        = {Improving Interpretability of Deep Neural Networks With Semantic Information},
	author       = {Dong, Yinpeng and Su, Hang and Zhu, Jun and Zhang, Bo},
	year         = 2017,
	booktitle    = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
	pages        = {4306--4314},
	url          = {https://openaccess.thecvf.com/content_cvpr_2017/html/Dong_Improving_Interpretability_of_CVPR_2017_paper.html}
}
@misc{Dosovitskiy_Beyer_Kolesnikov_Weissenborn_Zhai_Unterthiner_Dehghani_Minderer_Heigold_Gelly_etal._2021,
	title        = {An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},
	author       = {Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil},
	year         = 2021,
	month        = jun,
	eprint       = {2010.11929},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2010.11929}
}
@inproceedings{Dufter_Schütze_2020,
	title        = {Identifying Elements Essential for {BERT}’s Multilinguality},
	author       = {Dufter, Philipp and Schütze, Hinrich},
	year         = 2020,
	booktitle    = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
	publisher    = {Association for Computational Linguistics},
	address      = {Online},
	pages        = {4423--4437},
	doi          = {10.18653/v1/2020.emnlp-main.358},
	url          = {https://www.aclweb.org/anthology/2020.emnlp-main.358},
	language     = {en}
}
@misc{Dwivedi_Bresson_2021,
	title        = {A Generalization of Transformer Networks to Graphs},
	author       = {Dwivedi, Vijay Prakash and Bresson, Xavier},
	year         = 2021,
	month        = jan,
	eprint       = {2012.09699},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2012.09699}
}
@misc{Dwivedi_Joshi_Luu_Laurent_Bengio_Bresson_2022,
	title        = {Benchmarking Graph Neural Networks},
	author       = {Dwivedi, Vijay Prakash and Joshi, Chaitanya K. and Luu, Anh Tuan and Laurent, Thomas and Bengio, Yoshua and Bresson, Xavier},
	year         = 2022,
	month        = dec,
	eprint       = {2003.00982},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2003.00982}
}
@inproceedings{Dwork_Immorlica_Kalai_Leiserson_2018,
	title        = {Decoupled Classifiers for Group-Fair and Efficient Machine Learning},
	author       = {Dwork, Cynthia and Immorlica, Nicole and Kalai, Adam Tauman and Leiserson, Max},
	year         = 2018,
	month        = jan,
	booktitle    = {Proceedings of the 1st Conference on Fairness, Accountability and Transparency},
	publisher    = {PMLR},
	pages        = {119--133},
	issn         = {2640-3498},
	url          = {https://proceedings.mlr.press/v81/dwork18a.html},
	language     = {en}
}
@article{Eckart_Young_1936,
	title        = {The approximation of one matrix by another of lower rank},
	author       = {Eckart, Carl and Young, Gale},
	year         = 1936,
	month        = sep,
	journal      = {Psychometrika},
	volume       = 1,
	number       = 3,
	pages        = {211--218},
	doi          = {10.1007/BF02288367},
	issn         = {1860-0980},
	language     = {en}
}
@techreport{Erhan_Bengio_Courville_Vincent_2009,
	title        = {Visualizing Higher-Layer Features of a Deep Network},
	author       = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Vincent, Pascal},
	year         = 2009,
	month        = jan,
	institution  = {Université de Montréal},
	number       = {1341},
	type         = {Technical Report}
}
@inproceedings{Eykholt_Evtimov_Fernandes_Li_Rahmati_Xiao_Prakash_Kohno_Song_2018,
	title        = {Robust Physical-World Attacks on Deep Learning Visual Classification},
	author       = {Eykholt, Kevin and Evtimov, Ivan and Fernandes, Earlence and Li, Bo and Rahmati, Amir and Xiao, Chaowei and Prakash, Atul and Kohno, Tadayoshi and Song, Dawn},
	year         = 2018,
	booktitle    = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
	pages        = {1625--1634},
	url          = {https://openaccess.thecvf.com/content_cvpr_2018/html/Eykholt_Robust_Physical-World_Attacks_CVPR_2018_paper.html}
}
@misc{Fedus_Zoph_Shazeer_2022,
	title        = {{Switch Transformers}: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity},
	author       = {Fedus, William and Zoph, Barret and Shazeer, Noam},
	year         = 2022,
	month        = jun,
	eprint       = {2101.03961},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2101.03961}
}
@misc{Feng_Yang_Cer_Arivazhagan_Wang_2022,
	title        = {Language-agnostic {BERT} Sentence Embedding},
	author       = {Feng, Fangxiaoyu and Yang, Yinfei and Cer, Daniel and Arivazhagan, Naveen and Wang, Wei},
	year         = 2022,
	month        = mar,
	eprint       = {2007.01852},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2007.01852}
}
@inproceedings{Fong_Vedaldi_2017,
	title        = {Interpretable Explanations of Black Boxes by Meaningful Perturbation},
	author       = {Fong, Ruth and Vedaldi, Andrea},
	year         = 2017,
	month        = oct,
	booktitle    = {2017 {IEEE} International Conference on Computer Vision ({ICCV})},
	pages        = {3449--3457},
	doi          = {10.1109/ICCV.2017.371},
	eprint       = {1704.03296},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1704.03296}
}
@article{Gage_1994,
	title        = {A new algorithm for data compression},
	author       = {Gage, Philip},
	year         = 1994,
	journal      = {The C Users Journal},
	volume       = 12,
	number       = 2,
	pages        = {23--38},
	issn         = {0898-9788}
}
@inproceedings{Gomez_Ren_Urtasun_Grosse_2017,
	author       = {Gomez, Aidan N and Ren, Mengye and Urtasun, Raquel and Grosse, Roger B},
	title        = {The Reversible Residual Network: Backpropagation Without Storing Activations},
	booktitle    = {Advances in Neural Information Processing Systems},
	volume       = {30},
	publisher    = {Curran Associates, Inc.},
	year         = {2017},
	url          = {https://proceedings.neurips.cc/paper/2017/hash/f9be311e65d81a9ad8150a60844bb94c-Abstract.html}
}
@misc{Goodfellow_Shlens_Szegedy_2015,
	title        = {Explaining and Harnessing Adversarial Examples},
	author       = {Goodfellow, Ian J. and Shlens, Jonathon and Szegedy, Christian},
	year         = 2015,
	month        = mar,
	eprint       = {1412.6572},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1412.6572}
}
@inproceedings{Gordaliza_Barrio_Fabrice_Loubes_2019,
	title        = {Obtaining Fairness using Optimal Transport Theory},
	author       = {Gordaliza, Paula and del Barrio, Eustasio and Gamboa, Fabrice and Loubes, Jean-Michel},
	year         = 2019,
	month        = may,
	booktitle    = {Proceedings of the 36th International Conference on Machine Learning},
	publisher    = {PMLR},
	pages        = {2357--2365},
	issn         = {2640-3498},
	url          = {https://proceedings.mlr.press/v97/gordaliza19a.html},
	language     = {en}
}
@misc{Graves_2014,
	title        = {Generating Sequences With Recurrent Neural Networks},
	author       = {Graves, Alex},
	year         = 2014,
	month        = jun,
	eprint       = {1308.0850},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1308.0850}
}
@misc{Graves_Wayne_Danihelka_2014,
	title        = {Neural {Turing} Machines},
	author       = {Graves, Alex and Wayne, Greg and Danihelka, Ivo},
	year         = 2014,
	month        = dec,
	eprint       = {1410.5401},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1410.5401}
}
@article{Gray_Neuhoff_1998,
	title        = {Quantization},
	author       = {Gray, Robert M. and Neuhoff, David L.},
	year         = 1998,
	month        = oct,
	journal      = {IEEE Transactions on Information Theory},
	volume       = 44,
	number       = 6,
	pages        = {2325--2383},
	doi          = {10.1109/18.720541},
	issn         = {0018-9448},
	rights       = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html}
}
@article{Guo_Zhang_Liu_2019,
	title        = {{Gaussian} Transformer: A Lightweight Approach for Natural Language Inference},
	author       = {Guo, Maosheng and Zhang, Yu and Liu, Ting},
	year         = 2019,
	month        = jul,
	journal      = {Proceedings of the AAAI Conference on Artificial Intelligence},
	volume       = 33,
	number       = {01},
	pages        = {6489--6496},
	doi          = {10.1609/aaai.v33i01.33016489},
	issn         = {2374-3468},
	rights       = {Copyright (c) 2019 Association for the Advancement of Artificial Intelligence},
	language     = {en}
}
@article{Guo_Qiu_Liu_Xue_Zhang_2020,
	title        = {Multi-Scale Self-Attention for Text Classification},
	author       = {Guo, Qipeng and Qiu, Xipeng and Liu, Pengfei and Xue, Xiangyang and Zhang, Zheng},
	year         = 2020,
	month        = apr,
	journal      = {Proceedings of the AAAI Conference on Artificial Intelligence},
	volume       = 34,
	number       = {05},
	pages        = {7847--7854},
	doi          = {10.1609/aaai.v34i05.6290},
	issn         = {2374-3468},
	rights       = {Copyright (c) 2020 Association for the Advancement of Artificial Intelligence},
	language     = {en}
}
@inproceedings{He_Fan_Wu_Xie_Girshick_2020,
	title        = {Momentum Contrast for Unsupervised Visual Representation Learning},
	author       = {He, Kaiming and Fan, Haoqi and Wu, Yuxin and Xie, Saining and Girshick, Ross},
	year         = 2020,
	booktitle    = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
	pages        = {9729--9738},
	url          = {https://openaccess.thecvf.com/content_CVPR_2020/html/He_Momentum_Contrast_for_Unsupervised_Visual_Representation_Learning_CVPR_2020_paper.html}
}
@inproceedings{He_Zhang_Ren_Sun_2016,
	title        = {Deep Residual Learning for Image Recognition},
	author       = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
	year         = 2016,
	booktitle    = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
	pages        = {770--778},
	url          = {https://openaccess.thecvf.com/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html}
}
@misc{He_Ravula_Kanagal_Ainslie_2021,
	title        = {{RealFormer}: Transformer Likes Residual Attention},
	author       = {He, Ruining and Ravula, Anirudh and Kanagal, Bhargav and Ainslie, Joshua},
	year         = 2021,
	month        = sep,
	eprint       = {2012.11747},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2012.11747}
}
@book{Hebb_2005,
	title        = {The Organization of Behavior},
	author       = {Hebb, Donald O.},
	year         = 2005,
	month        = apr,
	publisher    = {Psychology Press},
	doi          = {10.4324/9781410612403},
	isbn         = {978-1-4106-1240-3},
	url          = {https://www.taylorfrancis.com/books/9781135631918},
	language     = {en}
}
@inproceedings{Hendricks_Akata_Rohrbach_Donahue_Schiele_Darrell_2016,
	title        = {Generating Visual Explanations},
	author       = {Hendricks, Lisa Anne and Akata, Zeynep and Rohrbach, Marcus and Donahue, Jeff and Schiele, Bernt and Darrell, Trevor},
	year         = 2016,
	month        = oct,
	booktitle    = {Computer Vision -- {ECCV} 2016},
	publisher    = {Springer International Publishing},
	series       = {Lecture Notes in Computer Science},
	volume       = 9908,
	pages        = {3--19},
	doi          = {10.1007/978-3-319-46493-0_1},
	isbn         = {978-3-319-46492-3}
}
@misc{Hewitt_Liang_2019,
	title        = {Designing and Interpreting Probes with Control Tasks},
	author       = {Hewitt, John and Liang, Percy},
	year         = 2019,
	month        = sep,
	eprint       = {1909.03368},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1909.03368}
}
@inproceedings{Hind_Wei_Campbell_Codella_Dhurandhar_Mojsilović_Natesan_Ramamurthy_Varshney_2019,
	title        = {{TED}: Teaching {AI} to Explain its Decisions},
	author       = {Hind, Michael and Wei, Dennis and Campbell, Murray and Codella, Noel C. F. and Dhurandhar, Amit and Mojsilović, Aleksandra and Natesan Ramamurthy, Karthikeyan and Varshney, Kush R.},
	year         = 2019,
	booktitle    = {Proceedings of the 2019 AAAI/ACM Conference on AI, Ethics, and Society},
	publisher    = {Association for Computing Machinery},
	address      = {New York, NY, USA},
	series       = {AIES ’19},
	pages        = {123--129},
	doi          = {10.1145/3306618.3314273},
	isbn         = {978-1-4503-6324-2},
	url          = {https://dl.acm.org/doi/10.1145/3306618.3314273}
}
@misc{Hinton_Vinyals_Dean_2015,
	title        = {Distilling the Knowledge in a Neural Network},
	author       = {Hinton, Geoffrey and Vinyals, Oriol and Dean, Jeff},
	year         = 2015,
	month        = mar,
	eprint       = {1503.02531},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1503.02531}
}
@article{Hinton_Osindero_Teh_2006,
	title        = {A Fast Learning Algorithm for Deep Belief Nets},
	author       = {Hinton, Geoffrey E. and Osindero, Simon and Teh, Yee-Whye},
	year         = 2006,
	month        = jul,
	journal      = {Neural Computation},
	volume       = 18,
	number       = 7,
	pages        = {1527--1554},
	doi          = {10.1162/neco.2006.18.7.1527},
	issn         = {0899-7667}
}
@misc{Ho_Kalchbrenner_Weissenborn_Salimans_2019,
	title        = {Axial Attention in Multidimensional Transformers},
	author       = {Ho, Jonathan and Kalchbrenner, Nal and Weissenborn, Dirk and Salimans, Tim},
	year         = 2019,
	month        = dec,
	eprint       = {1912.12180},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1912.12180}
}
@article{Hochreiter_1998,
	title        = {The Vanishing Gradient Problem During Learning Recurrent Neural Nets and Problem Solutions},
	author       = {Hochreiter, Sepp},
	year         = 1998,
	month        = apr,
	journal      = {International Journal of Uncertainty, Fuzziness and Knowledge-Based Systems},
	volume       = 6,
	number       = 2,
	pages        = {107--116},
	doi          = {10.1142/S0218488598000094},
	issn         = {0218-4885, 1793-6411},
	language     = {en}
}
@article{Hochreiter_Schmidhuber_1997,
	title        = {Long Short-Term Memory},
	author       = {Hochreiter, Sepp and Schmidhuber, Jürgen},
	year         = 1997,
	month        = nov,
	journal      = {Neural Computation},
	volume       = 9,
	number       = 8,
	pages        = {1735--1780},
	doi          = {10.1162/neco.1997.9.8.1735},
	issn         = {0899-7667, 1530-888X},
	language     = {en}
}
@article{Hopfield_1982,
	title        = {Neural networks and physical systems with emergent collective computational abilities},
	author       = {Hopfield, John J.},
	year         = 1982,
	month        = apr,
	journal      = {Proceedings of the National Academy of Sciences},
	volume       = 79,
	number       = 8,
	pages        = {2554--2558},
	doi          = {10.1073/pnas.79.8.2554},
	issn         = {0027-8424, 1091-6490},
	language     = {en}
}
@misc{Hou_Zhou_2020,
	title        = {Learning with Interpretable Structure from Gated {RNN}},
	author       = {Hou, Bo-Jian and Zhou, Zhi-Hua},
	year         = 2020,
	month        = jan,
	eprint       = {1810.10708},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1810.10708}
}
@misc{Howard_Ruder_2018,
	title        = {Universal Language Model Fine-tuning for Text Classification},
	author       = {Howard, Jeremy and Ruder, Sebastian},
	year         = 2018,
	month        = may,
	eprint       = {1801.06146},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1801.06146}
}
@misc{Hsu_Bolte_Tsai_Lakhotia_Salakhutdinov_Mohamed_2021,
	title        = {{HuBERT}: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units},
	author       = {Hsu, Wei-Ning and Bolte, Benjamin and Tsai, Yao-Hung Hubert and Lakhotia, Kushal and Salakhutdinov, Ruslan and Mohamed, Abdelrahman},
	year         = 2021,
	month        = jun,
	eprint       = {2106.07447},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2106.07447}
}
@misc{Hu_Johnson_Firat_Siddhant_Neubig_2021,
	title        = {Explicit Alignment Objectives for Multilingual Bidirectional Encoders},
	author       = {Hu, Junjie and Johnson, Melvin and Firat, Orhan and Siddhant, Aditya and Neubig, Graham},
	year         = 2021,
	month        = apr,
	eprint       = {2010.07972},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2010.07972}
}
@inproceedings{Hu_Ruder_Siddhant_Neubig_Firat_Johnson_2020,
	title        = {{XTREME}: A Massively Multilingual Multi-task Benchmark for Evaluating Cross-lingual Generalisation},
	author       = {Hu, Junjie and Ruder, Sebastian and Siddhant, Aditya and Neubig, Graham and Firat, Orhan and Johnson, Melvin},
	year         = 2020,
	month        = nov,
	booktitle    = {Proceedings of the 37th International Conference on Machine Learning},
	publisher    = {PMLR},
	pages        = {4411--4421},
	issn         = {2640-3498},
	url          = {https://proceedings.mlr.press/v119/hu20b.html},
	language     = {en}
}
@misc{Hu_Singh_2021,
	title        = {{UniT}: Multimodal Multitask Learning with a Unified Transformer},
	author       = {Hu, Ronghang and Singh, Amanpreet},
	year         = 2021,
	month        = aug,
	eprint       = {2102.10772},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2102.10772}
}
@misc{Huang_Vaswani_Uszkoreit_Shazeer_Simon_Hawthorne_Dai_Hoffman_Dinculescu_Eck_2018,
	title        = {Music Transformer},
	author       = {Huang, Cheng-Zhi Anna and Vaswani, Ashish and Uszkoreit, Jakob and Shazeer, Noam and Simon, Ian and Hawthorne, Curtis and Dai, Andrew M. and Hoffman, Matthew D. and Dinculescu, Monica and Eck, Douglas},
	year         = 2018,
	month        = dec,
	eprint       = {1809.04281},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1809.04281}
}
@misc{Huang_Liang_Duan_Gong_Shou_Jiang_Zhou_2019,
	title        = {{Unicoder}: A Universal Language Encoder by Pre-training with Multiple Cross-lingual Tasks},
	author       = {Huang, Haoyang and Liang, Yaobo and Duan, Nan and Gong, Ming and Shou, Linjun and Jiang, Daxin and Zhou, Ming},
	year         = 2019,
	month        = sep,
	eprint       = {1909.00964},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1909.00964}
}
@inproceedings{Huang_Wang_Huang_Huang_Wei_Liu_2019,
	title        = {{CCNet}: Criss-Cross Attention for Semantic Segmentation},
	author       = {Huang, Zilong and Wang, Xinggang and Huang, Lichao and Huang, Chang and Wei, Yunchao and Liu, Wenyu},
	year         = 2019,
	booktitle    = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV})},
	pages        = {603--612},
	url          = {https://openaccess.thecvf.com/content_ICCV_2019/html/Huang_CCNet_Criss-Cross_Attention_for_Semantic_Segmentation_ICCV_2019_paper.html}
}
@misc{Hudson_Manning_2018,
	title        = {Compositional Attention Networks for Machine Reasoning},
	author       = {Hudson, Drew A. and Manning, Christopher D.},
	year         = 2018,
	month        = apr,
	eprint       = {1803.03067},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1803.03067}
}
@article{Hupkes_Veldhoen_Zuidema_2018,
	title        = {Visualisation and “Diagnostic Classifiers” Reveal How Recurrent and Recursive Neural Networks Process Hierarchical Structure},
	author       = {Hupkes, Dieuwke and Veldhoen, Sara and Zuidema, Willem},
	year         = 2018,
	month        = apr,
	journal      = {Journal of Artificial Intelligence Research},
	volume       = 61,
	pages        = {907--926},
	doi          = {10.1613/jair.1.11196},
	issn         = {1076-9757}
}
@inproceedings{Indyk_Motwani_1998,
	title        = {Approximate nearest neighbors: towards removing the curse of dimensionality},
	author       = {Indyk, Piotr and Motwani, Rajeev},
	year         = 1998,
	booktitle    = {Proceedings of the Thirtieth Annual {ACM} Symposium on Theory of Computing ({STOC} '98)},
	publisher    = {ACM Press},
	address      = {Dallas, Texas, United States},
	pages        = {604--613},
	doi          = {10.1145/276698.276876},
	isbn         = {978-0-89791-962-3},
	url          = {https://dl.acm.org/doi/10.1145/276698.276876},
	language     = {en}
}
@misc{Ioffe_Szegedy_2015,
	title        = {Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift},
	author       = {Ioffe, Sergey and Szegedy, Christian},
	year         = 2015,
	month        = mar,
	eprint       = {1502.03167},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1502.03167}
}
@inproceedings{Iyer_Li_Li_Lewis_Sundar_Sycara_2018,
	title        = {Transparency and Explanation in Deep Reinforcement Learning Neural Networks},
	author       = {Iyer, Rahul and Li, Yuezhang and Li, Huao and Lewis, Michael and Sundar, Ramitha and Sycara, Katia},
	year         = 2018,
	booktitle    = {Proceedings of the 2018 AAAI/ACM Conference on AI, Ethics, and Society},
	publisher    = {Association for Computing Machinery},
	address      = {New York, NY, USA},
	series       = {AIES ’18},
	pages        = {144--150},
	doi          = {10.1145/3278721.3278776},
	isbn         = {978-1-4503-6012-8},
	url          = {https://dl.acm.org/doi/10.1145/3278721.3278776}
}
@misc{Jain_Wallace_2019,
	title        = {Attention is not Explanation},
	author       = {Jain, Sarthak and Wallace, Byron C.},
	year         = 2019,
	month        = may,
	eprint       = {1902.10186},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1902.10186}
}
@inproceedings{Jiang_Kim_Guan_Gupta_2018,
	author       = {Jiang, Heinrich and Kim, Been and Guan, Melody and Gupta, Maya},
	title        = {To Trust Or Not To Trust A Classifier},
	booktitle    = {Advances in Neural Information Processing Systems},
	volume       = {31},
	publisher    = {Curran Associates, Inc.},
	year         = {2018},
	url          = {https://proceedings.neurips.cc/paper/2018/hash/7180cffd6a8e829dacfc2a31b3f72ece-Abstract.html}
}
@misc{K_Wang_Mayhew_Roth_2020,
	title        = {Cross-Lingual Ability of Multilingual {BERT}: An Empirical Study},
	author       = {K, Karthikeyan and Wang, Zihan and Mayhew, Stephen and Roth, Dan},
	year         = 2020,
	month        = feb,
	eprint       = {1912.07840},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1912.07840}
}
@misc{Kaiser_Sutskever_2016,
	title        = {Neural {GPUs} Learn Algorithms},
	author       = {Kaiser, Łukasz and Sutskever, Ilya},
	year         = 2016,
	month        = mar,
	eprint       = {1511.08228},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1511.08228}
}
@inproceedings{Kakwani_Kunchukuttan_Golla_N.C._Bhattacharyya_Khapra_Kumar_2020,
	title        = {{IndicNLPSuite}: Monolingual Corpora, Evaluation Benchmarks and Pre-trained Multilingual Language Models for {Indian} Languages},
	author       = {Kakwani, Divyanshu and Kunchukuttan, Anoop and Golla, Satish and N.C., Gokul and Bhattacharyya, Avik and Khapra, Mitesh M. and Kumar, Pratyush},
	year         = 2020,
	booktitle    = {Findings of the Association for Computational Linguistics: EMNLP 2020},
	publisher    = {Association for Computational Linguistics},
	address      = {Online},
	pages        = {4948--4961},
	doi          = {10.18653/v1/2020.findings-emnlp.445},
	url          = {https://www.aclweb.org/anthology/2020.findings-emnlp.445},
	language     = {en}
}
@misc{Kalyan_Rajasekharan_Sangeetha_2021,
	title        = {{AMMUS} : A Survey of Transformer-based Pretrained Models in Natural Language Processing},
	author       = {Kalyan, Katikapalli Subramanyam and Rajasekharan, Ajit and Sangeetha, Sivanesan},
	year         = 2021,
	month        = aug,
	eprint       = {2108.05542},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2108.05542}
}
@article{Kamiran_Calders_2012,
	title        = {Data preprocessing techniques for classification without discrimination},
	author       = {Kamiran, Faisal and Calders, Toon},
	year         = 2012,
	month        = oct,
	journal      = {Knowledge and Information Systems},
	volume       = 33,
	number       = 1,
	pages        = {1--33},
	doi          = {10.1007/s10115-011-0463-8},
	issn         = {0219-1377, 0219-3116},
	language     = {en}
}
@inproceedings{Kang_Raghavan_Bailis_Zaharia,
	title        = {Model Assertions for Debugging Machine Learning},
	author       = {Kang, Daniel and Raghavan, Deepti and Bailis, Peter and Zaharia, Matei},
	year         = 2018,
	booktitle    = {{NeurIPS} 2018 Workshop on Systems for {ML} ({MLSys})},
	language     = {en}
}
@misc{Khanuja_Bansal_Mehtani_Khosla_Dey_Gopalan_Margam_Aggarwal_Nagipogu_Dave_etal._2021,
	title        = {{MuRIL}: Multilingual Representations for {Indian} Languages},
	author       = {Khanuja, Simran and Bansal, Diksha and Mehtani, Sarvesh and Khosla, Savya and Dey, Atreyee and Gopalan, Balaji and Margam, Dilip Kumar and Aggarwal, Pooja and Nagipogu, Rajiv Teja and Dave, Shachi and Gupta, Shruti and Gali, Subhash Chandra Bose and Subramanian, Vish and Talukdar, Partha},
	year         = 2021,
	month        = apr,
	eprint       = {2103.10730},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2103.10730}
}
@misc{Kitaev_Kaiser_Levskaya_2020,
	title        = {{Reformer}: The Efficient Transformer},
	author       = {Kitaev, Nikita and Kaiser, Łukasz and Levskaya, Anselm},
	year         = 2020,
	month        = feb,
	eprint       = {2001.04451},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2001.04451}
}
@misc{Kobayashi_Kuribayashi_Yokoi_Inui_2020,
	title        = {Attention is Not Only a Weight: Analyzing Transformers with Vector Norms},
	author       = {Kobayashi, Goro and Kuribayashi, Tatsuki and Yokoi, Sho and Inui, Kentaro},
	year         = 2020,
	month        = oct,
	eprint       = {2004.10102},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2004.10102}
}
@misc{Kudo_Richardson_2018,
	title        = {{SentencePiece}: A simple and language independent subword tokenizer and detokenizer for Neural Text Processing},
	author       = {Kudo, Taku and Richardson, John},
	year         = 2018,
	month        = aug,
	eprint       = {1808.06226},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1808.06226}
}
@misc{Lai_Xie_Liu_Yang_Hovy_2017,
	title        = {{RACE}: Large-scale {ReAding} Comprehension Dataset From Examinations},
	author       = {Lai, Guokun and Xie, Qizhe and Liu, Hanxiao and Yang, Yiming and Hovy, Eduard},
	year         = 2017,
	month        = dec,
	eprint       = {1704.04683},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1704.04683}
}
@inproceedings{Lapuschkin_Binder_Montavon_Muller_Samek_2016,
	title        = {Analyzing Classifiers: {Fisher} Vectors and Deep Neural Networks},
	author       = {Lapuschkin, Sebastian and Binder, Alexander and Montavon, Grégoire and Müller, Klaus-Robert and Samek, Wojciech},
	year         = 2016,
	booktitle    = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
	pages        = {2912--2920},
	url          = {https://openaccess.thecvf.com/content_cvpr_2016/html/Bach_Analyzing_Classifiers_Fisher_CVPR_2016_paper.html}
}
@misc{Lauscher_Ravishankar_Vulić_Glavaš_2020,
	title        = {From Zero to Hero: On the Limitations of Zero-Shot Cross-Lingual Transfer with Multilingual Transformers},
	author       = {Lauscher, Anne and Ravishankar, Vinit and Vulić, Ivan and Glavaš, Goran},
	year         = 2020,
	month        = may,
	eprint       = {2005.00633},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/2005.00633}
}
@article{LeCun_Boser_Denker_Henderson_Howard_Hubbard_Jackel_1989,
	title        = {Backpropagation Applied to Handwritten Zip Code Recognition},
	author       = {LeCun, Y. and Boser, B. and Denker, J. S. and Henderson, D. and Howard, R. E. and Hubbard, W. and Jackel, L. D.},
	year         = 1989,
	month        = dec,
	journal      = {Neural Computation},
	volume       = 1,
	number       = 4,
	pages        = {541--551},
	doi          = {10.1162/neco.1989.1.4.541},
	issn         = {0899-7667}
}
@inproceedings{Lee_Glass_2012,
	title        = {A Nonparametric {Bayesian} Approach to Acoustic Model Discovery},
	author       = {Lee, Chia-ying and Glass, James},
	year         = 2012,
	month        = jul,
	booktitle    = {Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
	publisher    = {Association for Computational Linguistics},
	address      = {Jeju Island, Korea},
	pages        = {40--49},
	url          = {https://aclanthology.org/P12-1005},
	editor       = {Li, Haizhou and Lin, Chin-Yew and Osborne, Miles and Lee, Gary Geunbae and Park, Jong C.}
}
@article{Lee_Yoon_Kim_Kim_Kim_So_Kang_2020,
	title        = {{BioBERT}: a pre-trained biomedical language representation model for biomedical text mining},
	author       = {Lee, Jinhyuk and Yoon, Wonjin and Kim, Sungdong and Kim, Donghyeon and Kim, Sunkyu and So, Chan Ho and Kang, Jaewoo},
	year         = 2020,
	month        = feb,
	journal      = {Bioinformatics},
	volume       = 36,
	number       = 4,
	pages        = {1234--1240},
	doi          = {10.1093/bioinformatics/btz682},
	issn         = {1367-4803}
}
@inproceedings{Lee_Lee_Kim_Kosiorek_Choi_Teh_2019,
	title        = {Set Transformer: A Framework for Attention-based Permutation-Invariant Neural Networks},
	author       = {Lee, Juho and Lee, Yoonho and Kim, Jungtaek and Kosiorek, Adam and Choi, Seungjin and Teh, Yee Whye},
	year         = 2019,
	month        = may,
	booktitle    = {Proceedings of the 36th International Conference on Machine Learning},
	publisher    = {PMLR},
	pages        = {3744--3753},
	issn         = {2640-3498},
	url          = {https://proceedings.mlr.press/v97/lee19d.html},
	language     = {en}
}
@misc{Lei_Barzilay_Jaakkola_2016,
	title        = {Rationalizing Neural Predictions},
	author       = {Lei, Tao and Barzilay, Regina and Jaakkola, Tommi},
	year         = 2016,
	month        = nov,
	eprint       = {1606.04155},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1606.04155}
}
@inproceedings{Letarte_Paradis_Giguère_Laviolette_2018,
	title        = {Importance of Self-Attention for Sentiment Analysis},
	author       = {Letarte, Gaël and Paradis, Frédérik and Giguère, Philippe and Laviolette, François},
	year         = 2018,
	booktitle    = {Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP},
	publisher    = {Association for Computational Linguistics},
	address      = {Brussels, Belgium},
	pages        = {267--275},
	doi          = {10.18653/v1/W18-5429},
	url          = {http://aclweb.org/anthology/W18-5429},
	language     = {en}
}
@misc{Lewis_Liu_Goyal_Ghazvininejad_Mohamed_Levy_Stoyanov_Zettlemoyer_2019,
	title        = {{BART}: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension},
	author       = {Lewis, Mike and Liu, Yinhan and Goyal, Naman and Ghazvininejad, Marjan and Mohamed, Abdelrahman and Levy, Omer and Stoyanov, Ves and Zettlemoyer, Luke},
	year         = 2019,
	month        = oct,
	eprint       = {1910.13461},
	archiveprefix = {arXiv},
	primaryclass = {cs},
	url          = {https://arxiv.org/abs/1910.13461}
}
@misc{Lewis_Oğuz_Rinott_Riedel_Schwenk_2020,
	title        = {{MLQA}: Evaluating Cross-lingual Extractive Question Answering},
	author       = {Lewis, Patrick and Oğuz, Barlas and Rinott, Ruty and Riedel, Sebastian and Schwenk, Holger},
	year         = 2020,
	month        = may,
	eprint       = {1910.07475},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1910.07475}
}
@misc{Li_Tu_Yang_Lyu_Zhang_2018,
	title        = {Multi-Head Attention with Disagreement Regularization},
	author       = {Li, Jian and Tu, Zhaopeng and Yang, Baosong and Lyu, Michael R. and Zhang, Tong},
	year         = 2018,
	month        = oct,
	eprint       = {1810.10183},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1810.10183}
}
@misc{Li_Monroe_Jurafsky_2017,
	title        = {Understanding Neural Networks through Representation Erasure},
	author       = {Li, Jiwei and Monroe, Will and Jurafsky, Dan},
	year         = 2017,
	month        = jan,
	eprint       = {1612.08220},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1612.08220}
}
@article{Li_Liu_Chen_Rudin_2018,
	title        = {Deep Learning for Case-Based Reasoning Through Prototypes: A Neural Network That Explains Its Predictions},
	author       = {Li, Oscar and Liu, Hao and Chen, Chaofan and Rudin, Cynthia},
	year         = 2018,
	month        = apr,
	journal      = {Proceedings of the AAAI Conference on Artificial Intelligence},
	volume       = 32,
	number       = 1,
	doi          = {10.1609/aaai.v32i1.11771},
	issn         = {2374-3468},
	url          = {https://ojs.aaai.org/index.php/AAAI/article/view/11771}
}
@misc{Liang_Duan_Gong_Wu_Guo_Qi_Gong_Shou_Jiang_Cao_etal._2020,
	title        = {{XGLUE}: A New Benchmark Dataset for Cross-lingual Pre-training, Understanding and Generation},
	author       = {Liang, Yaobo and Duan, Nan and Gong, Yeyun and Wu, Ning and Guo, Fenfei and Qi, Weizhen and Gong, Ming and Shou, Linjun and Jiang, Daxin and Cao, Guihong and Fan, Xiaodong and Zhang, Ruofei and Agrawal, Rahul and Cui, Edward and Wei, Sining and Bharti, Taroon and Qiao, Ying and Chen, Jiun-Hung and Wu, Winnie and Liu, Shuguang and Yang, Fan and Campos, Daniel and Majumder, Rangan and Zhou, Ming},
	year         = 2020,
	month        = may,
	eprint       = {2004.01401},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2004.01401}
}
@misc{Libovický_Rosa_Fraser_2019,
	title        = {How Language-Neutral is Multilingual {BERT}?},
	author       = {Libovický, Jindřich and Rosa, Rudolf and Fraser, Alexander},
	year         = 2019,
	month        = nov,
	eprint       = {1911.03310},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1911.03310}
}
@inproceedings{Limisiewicz_Rosa_Mareček_2020,
	title        = {{Universal Dependencies} according to {BERT}: both more specific and more general},
	author       = {Limisiewicz, Tomasz and Rosa, Rudolf and Mareček, David},
	year         = 2020,
	booktitle    = {Findings of the Association for Computational Linguistics: EMNLP 2020},
	pages        = {2710--2722},
	doi          = {10.18653/v1/2020.findings-emnlp.245},
	eprint       = {2004.14620},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2004.14620}
}
@misc{Lin_Wang_Liu_Qiu_2021,
	title        = {A Survey of Transformers},
	author       = {Lin, Tianyang and Wang, Yuxin and Liu, Xiangyang and Qiu, Xipeng},
	year         = 2021,
	month        = jun,
	eprint       = {2106.04554},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2106.04554}
}
@misc{Liu_Hsu_Chuang_Lee_2020,
	title        = {A Study of Cross-Lingual Ability and Language-specific Information in Multilingual {BERT}},
	author       = {Liu, Chi-Liang and Hsu, Tsung-Yuan and Chuang, Yung-Sung and Lee, Hung-Yi},
	year         = 2020,
	month        = apr,
	eprint       = {2004.09205},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2004.09205}
}
@misc{Liu_Yin_Wang_2019,
	title        = {Towards Explainable {NLP}: A Generative Explanation Framework for Text Classification},
	author       = {Liu, Hui and Yin, Qingyu and Wang, William Yang},
	year         = 2019,
	month        = jun,
	eprint       = {1811.00196},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1811.00196}
}
@misc{Liu_Saleh_Pot_Goodrich_Sepassi_Kaiser_Shazeer_2018,
	title        = {Generating {Wikipedia} by Summarizing Long Sequences},
	author       = {Liu, Peter J. and Saleh, Mohammad and Pot, Etienne and Goodrich, Ben and Sepassi, Ryan and Kaiser, Lukasz and Shazeer, Noam},
	year         = 2018,
	month        = jan,
	eprint       = {1801.10198},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1801.10198}
}
@inproceedings{Liu_McCarthy_Vulić_Korhonen_2019,
	title        = {Investigating Cross-Lingual Alignment Methods for Contextualized Embeddings with Token-Level Evaluation},
	author       = {Liu, Qianchu and McCarthy, Diana and Vulić, Ivan and Korhonen, Anna},
	year         = 2019,
	month        = nov,
	booktitle    = {Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)},
	publisher    = {Association for Computational Linguistics},
	address      = {Hong Kong, China},
	pages        = {33--43},
	doi          = {10.18653/v1/K19-1004},
	isbn         = {978-1-950737-72-7},
	url          = {https://www.repository.cam.ac.uk/handle/1810/297000},
	language     = {en}
}
@article{Liu_Gu_Goyal_Li_Edunov_Ghazvininejad_Lewis_Zettlemoyer_2020,
	title        = {Multilingual Denoising Pre-training for Neural Machine Translation},
	author       = {Liu, Yinhan and Gu, Jiatao and Goyal, Naman and Li, Xian and Edunov, Sergey and Ghazvininejad, Marjan and Lewis, Mike and Zettlemoyer, Luke},
	year         = 2020,
	month        = nov,
	journal      = {Transactions of the Association for Computational Linguistics},
	volume       = 8,
	pages        = {726--742},
	doi          = {10.1162/tacl_a_00343},
	issn         = {2307-387X}
}
@misc{Liu_Ott_Goyal_Du_Joshi_Chen_Levy_Lewis_Zettlemoyer_Stoyanov_2019,
	title        = {{RoBERTa}: A Robustly Optimized {BERT} Pretraining Approach},
	author       = {Liu, Yinhan and Ott, Myle and Goyal, Naman and Du, Jingfei and Joshi, Mandar and Chen, Danqi and Levy, Omer and Lewis, Mike and Zettlemoyer, Luke and Stoyanov, Veselin},
	year         = 2019,
	month        = jul,
	eprint       = {1907.11692},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1907.11692}
}
@misc{Liu_Winata_Madotto_Fung_2020,
	title        = {Exploring Fine-tuning Techniques for Pre-trained Cross-lingual Models via Continual Learning},
	author       = {Liu, Zihan and Winata, Genta Indra and Madotto, Andrea and Fung, Pascale},
	year         = 2020,
	month        = oct,
	eprint       = {2004.14218},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2004.14218}
}
@misc{Loshchilov_Hutter_2019,
	title        = {Decoupled Weight Decay Regularization},
	author       = {Loshchilov, Ilya and Hutter, Frank},
	year         = 2019,
	month        = jan,
	eprint       = {1711.05101},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1711.05101}
}
@inproceedings{Lu_Subburathinam_Ji_May_Chang_Sil_Voss_2020,
	title        = {Cross-lingual Structure Transfer for Zero-resource Event Extraction},
	author       = {Lu, Di and Subburathinam, Ananya and Ji, Heng and May, Jonathan and Chang, Shih-Fu and Sil, Avi and Voss, Clare},
	year         = 2020,
	month        = may,
	booktitle    = {Proceedings of the Twelfth Language Resources and Evaluation Conference},
	publisher    = {European Language Resources Association},
	address      = {Marseille, France},
	pages        = {1976--1981},
	isbn         = {979-10-95546-34-4},
	url          = {https://aclanthology.org/2020.lrec-1.243},
	editor       = {Calzolari, Nicoletta and Béchet, Frédéric and Blache, Philippe and Choukri, Khalid and Cieri, Christopher and Declerck, Thierry and Goggi, Sara and Isahara, Hitoshi and Maegaard, Bente and Mariani, Joseph and Mazo, Hélène and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios},
	language     = {en}
}
@misc{Lu_Batra_Parikh_Lee_2019,
	title        = {{ViLBERT}: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks},
	author       = {Lu, Jiasen and Batra, Dhruv and Parikh, Devi and Lee, Stefan},
	year         = 2019,
	month        = aug,
	eprint       = {1908.02265},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1908.02265}
}
@misc{Lu_Li_He_Sun_Dong_Qin_Wang_Liu_2019,
	title        = {Understanding and Improving Transformer From a Multi-Particle Dynamic System Point of View},
	author       = {Lu, Yiping and Li, Zhuohan and He, Di and Sun, Zhiqing and Dong, Bin and Qin, Tao and Wang, Liwei and Liu, Tie-Yan},
	year         = 2019,
	month        = jun,
	eprint       = {1906.02762},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1906.02762}
}
@misc{Luo_Wang_Liu_Liu_Bi_Huang_Huang_Si_2020,
	title        = {{VECO}: Variable Encoder-decoder Pre-training for Cross-lingual Understanding and Generation},
	author       = {Luo, Fuli and Wang, Wei and Liu, Jiahao and Liu, Yijia and Bi, Bin and Huang, Songfang and Huang, Fei and Si, Luo},
	year         = 2020,
	month        = oct,
	url          = {https://openreview.net/forum?id=YjNv-hzM8BE},
	language     = {en}
}
@misc{Luong_Pham_Manning_2015,
	title        = {Effective Approaches to Attention-based Neural Machine Translation},
	author       = {Luong, Minh-Thang and Pham, Hieu and Manning, Christopher D.},
	year         = 2015,
	month        = sep,
	eprint       = {1508.04025},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1508.04025}
}
@misc{Ma_Kong_Wang_Zhou_May_Ma_Zettlemoyer_2021,
	title        = {{Luna}: Linear Unified Nested Attention},
	author       = {Ma, Xuezhe and Kong, Xiang and Wang, Sinong and Zhou, Chunting and May, Jonathan and Ma, Hao and Zettlemoyer, Luke},
	year         = 2021,
	month        = nov,
	eprint       = {2106.01540},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2106.01540}
}
@misc{Madry_Makelov_Schmidt_Tsipras_Vladu_2019,
	title        = {Towards Deep Learning Models Resistant to Adversarial Attacks},
	author       = {Madry, Aleksander and Makelov, Aleksandar and Schmidt, Ludwig and Tsipras, Dimitris and Vladu, Adrian},
	year         = 2019,
	month        = sep,
	eprint       = {1706.06083},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1706.06083}
}
@article{McCulloch_Pitts_1943,
	title        = {A logical calculus of the ideas immanent in nervous activity},
	author       = {McCulloch, Warren S. and Pitts, Walter},
	year         = 1943,
	month        = dec,
	journal      = {The Bulletin of Mathematical Biophysics},
	volume       = 5,
	number       = 4,
	pages        = {115--133},
	doi          = {10.1007/BF02478259},
	issn         = {0007-4985, 1522-9602},
	rights       = {http://www.springer.com/tdm},
	language     = {en}
}
@misc{Mehta_Ghazvininejad_Iyer_Zettlemoyer_Hajishirzi_2021,
	title        = {{DeLighT}: Deep and Light-weight Transformer},
	author       = {Mehta, Sachin and Ghazvininejad, Marjan and Iyer, Srinivasan and Zettlemoyer, Luke and Hajishirzi, Hannaneh},
	year         = 2021,
	month        = feb,
	eprint       = {2008.00623},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2008.00623}
}
@misc{Mehta_Koncel-Kedziorski_Rastegari_Hajishirzi_2018,
	title        = {Pyramidal Recurrent Unit for Language Modeling},
	author       = {Mehta, Sachin and Koncel-Kedziorski, Rik and Rastegari, Mohammad and Hajishirzi, Hannaneh},
	year         = 2018,
	month        = aug,
	eprint       = {1808.09029},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1808.09029}
}
@misc{Mehta_Koncel-Kedziorski_Rastegari_Hajishirzi_2020,
	title        = {{DeFINE}: {DEep} Factorized {INput} Token Embeddings for Neural Sequence Modeling},
	author       = {Mehta, Sachin and Koncel-Kedziorski, Rik and Rastegari, Mohammad and Hajishirzi, Hannaneh},
	year         = 2020,
	month        = feb,
	eprint       = {1911.12385},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1911.12385}
}
@misc{Mikolov_Chen_Corrado_Dean_2013,
	title        = {Efficient Estimation of Word Representations in Vector Space},
	author       = {Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
	year         = 2013,
	month        = sep,
	eprint       = {1301.3781},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1301.3781}
}
@inproceedings{Mikolov_Karafiát_Burget_Černocký_Khudanpur_2010,
	title        = {Recurrent neural network based language model},
	author       = {Mikolov, Tomáš and Karafiát, Martin and Burget, Lukáš and Černocký, Jan and Khudanpur, Sanjeev},
	year         = 2010,
	month        = sep,
	booktitle    = {Interspeech 2010},
	publisher    = {ISCA},
	pages        = {1045--1048},
	doi          = {10.21437/Interspeech.2010-343},
	url          = {https://www.isca-archive.org/interspeech_2010/mikolov10_interspeech.html},
	language     = {en}
}
@inproceedings{Mikolov_Sutskever_Chen_Corrado_Dean_2013,
	title        = {Distributed Representations of Words and Phrases and their Compositionality},
	author       = {Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S. and Dean, Jeff},
	year         = 2013,
	booktitle    = {Advances in Neural Information Processing Systems},
	publisher    = {Curran Associates, Inc.},
	volume       = 26,
	url          = {https://proceedings.neurips.cc/paper/2013/hash/9aa42b31882ec039965f3c4923ce901b-Abstract.html}
}
@book{Minsky_Papert_1972,
	title        = {Perceptrons: an introduction to computational geometry},
	author       = {Minsky, Marvin and Papert, Seymour A.},
	year         = 1972,
	publisher    = {The MIT Press},
	address      = {Cambridge, MA},
	isbn         = {978-0-262-63022-1},
	note         = {Second printing with corrections},
	language     = {en}
}
@article{Montavon_Lapuschkin_Binder_Samek_Müller_2017,
	title        = {Explaining nonlinear classification decisions with deep {Taylor} decomposition},
	author       = {Montavon, Grégoire and Lapuschkin, Sebastian and Binder, Alexander and Samek, Wojciech and Müller, Klaus-Robert},
	year         = 2017,
	month        = may,
	journal      = {Pattern Recognition},
	volume       = 65,
	pages        = {211--222},
	doi          = {10.1016/j.patcog.2016.11.008},
	issn         = {0031-3203}
}
@article{Montavon_Samek_Müller_2018,
	title        = {Methods for interpreting and understanding deep neural networks},
	author       = {Montavon, Grégoire and Samek, Wojciech and Müller, Klaus-Robert},
	year         = 2018,
	month        = feb,
	journal      = {Digital Signal Processing},
	volume       = 73,
	pages        = {1--15},
	doi          = {10.1016/j.dsp.2017.10.011},
	issn         = {1051-2004}
}
@inproceedings{Moosavi-Dezfooli_Fawzi_Fawzi_Frossard_2017,
	title        = {Universal Adversarial Perturbations},
	author       = {Moosavi-Dezfooli, Seyed-Mohsen and Fawzi, Alhussein and Fawzi, Omar and Frossard, Pascal},
	year         = 2017,
	booktitle    = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
	pages        = {1765--1773},
	url          = {https://openaccess.thecvf.com/content_cvpr_2017/html/Moosavi-Dezfooli_Universal_Adversarial_Perturbations_CVPR_2017_paper.html}
}
@inproceedings{Mostafazadeh_Roth_Louis_Chambers_Allen_2017,
	title        = {{LSDSem} 2017 Shared Task: The {Story Cloze Test}},
	author       = {Mostafazadeh, Nasrin and Roth, Michael and Louis, Annie and Chambers, Nathanael and Allen, James},
	year         = 2017,
	booktitle    = {Proceedings of the 2nd Workshop on Linking Models of Lexical, Sentential and Discourse-level Semantics},
	publisher    = {Association for Computational Linguistics},
	address      = {Valencia, Spain},
	pages        = {46--51},
	doi          = {10.18653/v1/W17-0906},
	url          = {http://aclweb.org/anthology/W17-0906},
	language     = {en}
}
@inproceedings{Nguyen_Yosinski_Clune_2015,
	title        = {Deep Neural Networks Are Easily Fooled: High Confidence Predictions for Unrecognizable Images},
	author       = {Nguyen, Anh and Yosinski, Jason and Clune, Jeff},
	year         = 2015,
	booktitle    = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
	pages        = {427--436},
	url          = {https://www.cv-foundation.org/openaccess/content_cvpr_2015/html/Nguyen_Deep_Neural_Networks_2015_CVPR_paper.html}
}
@misc{Oord_Li_Vinyals_2019,
	title        = {Representation Learning with Contrastive Predictive Coding},
	author       = {van den Oord, Aaron and Li, Yazhe and Vinyals, Oriol},
	year         = 2019,
	month        = jan,
	eprint       = {1807.03748},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1807.03748}
}
@misc{Ouyang_Wang_Pang_Sun_Tian_Wu_Wang_2021,
	title        = {{ERNIE-M}: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora},
	author       = {Ouyang, Xuan and Wang, Shuohuan and Pang, Chao and Sun, Yu and Tian, Hao and Wu, Hua and Wang, Haifeng},
	year         = 2021,
	month        = sep,
	eprint       = {2012.15674},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2012.15674}
}
@inproceedings{Park_Hendricks_Akata_Rohrbach_Schiele_Darrell_Rohrbach_2018,
	title        = {Multimodal Explanations: Justifying Decisions and Pointing to the Evidence},
	author       = {Park, Dong Huk and Hendricks, Lisa Anne and Akata, Zeynep and Rohrbach, Anna and Schiele, Bernt and Darrell, Trevor and Rohrbach, Marcus},
	year         = 2018,
	booktitle    = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
	pages        = {8779--8788},
	url          = {https://openaccess.thecvf.com/content_cvpr_2018/html/Park_Multimodal_Explanations_Justifying_CVPR_2018_paper.html}
}
@misc{Partanen_Poibeau_Rießler,
	title        = {Developing Technologies for the Documentation and Description of the Low-resource {Uralic} Languages {Zyrian Komi} and {North Saami}},
	author       = {Partanen, Niko and Poibeau, Thierry and Rießler, Michael},
	language     = {en}
}
@misc{Phang_Calixto_Htut_Pruksachatkun_Liu_Vania_Kann_Bowman_2020,
	title        = {{English} Intermediate-Task Training Improves Zero-Shot Cross-Lingual Transfer Too},
	author       = {Phang, Jason and Calixto, Iacer and Htut, Phu Mon and Pruksachatkun, Yada and Liu, Haokun and Vania, Clara and Kann, Katharina and Bowman, Samuel R.},
	year         = 2020,
	month        = sep,
	eprint       = {2005.13013},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2005.13013}
}
@misc{Pires_Schlinger_Garrette_2019,
	title        = {How multilingual is Multilingual {BERT}?},
	author       = {Pires, Telmo and Schlinger, Eva and Garrette, Dan},
	year         = 2019,
	month        = jun,
	eprint       = {1906.01502},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1906.01502}
}
@misc{Ponti_Glavaš_Majewska_Liu_Vulić_Korhonen_2020,
	title        = {{XCOPA}: A Multilingual Dataset for Causal Commonsense Reasoning},
	author       = {Ponti, Edoardo Maria and Glavaš, Goran and Majewska, Olga and Liu, Qianchu and Vulić, Ivan and Korhonen, Anna},
	year         = 2020,
	month        = oct,
	eprint       = {2005.00333},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2005.00333}
}
@misc{Press_Smith_Levy_2020,
	title        = {Improving Transformer Models by Reordering their Sublayers},
	author       = {Press, Ofir and Smith, Noah A. and Levy, Omer},
	year         = 2020,
	month        = apr,
	doi          = {10.48550/arXiv.1911.03864},
	eprint       = {1911.03864},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1911.03864}
}
@techreport{Radford_Narasimhan_Salimans_Sutskever,
	title        = {Improving Language Understanding by Generative Pre-Training},
	author       = {Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya},
	year         = 2018,
	institution  = {OpenAI},
	language     = {en}
}
@techreport{Radford_Wu_Child_Luan_Amodei_Sutskever,
	title        = {Language Models are Unsupervised Multitask Learners},
	author       = {Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
	year         = 2019,
	institution  = {OpenAI},
	language     = {en}
}
@misc{Rae_Potapenko_Jayakumar_Lillicrap_2019,
	title        = {Compressive Transformers for Long-Range Sequence Modelling},
	author       = {Rae, Jack W. and Potapenko, Anna and Jayakumar, Siddhant M. and Lillicrap, Timothy P.},
	year         = 2019,
	month        = nov,
	eprint       = {1911.05507},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1911.05507}
}
@misc{Raffel_Shazeer_Roberts_Lee_Narang_Matena_Zhou_Li_Liu_2023,
	title        = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
	author       = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J.},
	year         = 2023,
	month        = sep,
	eprint       = {1910.10683},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1910.10683}
}
@misc{Raghu_Gilmer_Yosinski_Sohl-Dickstein_2017,
	title        = {{SVCCA}: Singular Vector Canonical Correlation Analysis for Deep Learning Dynamics and Interpretability},
	author       = {Raghu, Maithra and Gilmer, Justin and Yosinski, Jason and Sohl-Dickstein, Jascha},
	year         = 2017,
	month        = nov,
	eprint       = {1706.05806},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1706.05806}
}
@misc{Ras_van_Gerven_Haselager_2018,
	title        = {Explanation Methods in Deep Learning: Users, Values, Concerns and Challenges},
	author       = {Ras, Gabrielle and van Gerven, Marcel and Haselager, Pim},
	year         = 2018,
	month        = mar,
	doi          = {10.48550/arXiv.1803.07517},
	eprint       = {1803.07517},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1803.07517}
}
@misc{Ras_Xie_van_Gerven_Doran_2021,
	title        = {Explainable Deep Learning: A Field Guide for the Uninitiated},
	author       = {Ras, Gabrielle and Xie, Ning and van Gerven, Marcel and Doran, Derek},
	year         = 2021,
	month        = sep,
	eprint       = {2004.14545},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2004.14545}
}
@inproceedings{Ribeiro_Singh_Guestrin_2016,
	title        = {{``Why Should I Trust You?''}: Explaining the Predictions of Any Classifier},
	author       = {Ribeiro, Marco Tulio and Singh, Sameer and Guestrin, Carlos},
	year         = 2016,
	booktitle    = {Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
	publisher    = {Association for Computing Machinery},
	address      = {New York, NY, USA},
	series       = {KDD '16},
	pages        = {1135--1144},
	doi          = {10.1145/2939672.2939778},
	isbn         = {978-1-4503-4232-2},
	url          = {https://dl.acm.org/doi/10.1145/2939672.2939778}
}
@article{Ribeiro_Singh_Guestrin_2018,
	title        = {Anchors: High-Precision Model-Agnostic Explanations},
	author       = {Ribeiro, Marco Tulio and Singh, Sameer and Guestrin, Carlos},
	year         = 2018,
	month        = apr,
	journal      = {Proceedings of the AAAI Conference on Artificial Intelligence},
	volume       = 32,
	number       = 1,
	doi          = {10.1609/aaai.v32i1.11491},
	issn         = {2374-3468},
	url          = {https://ojs.aaai.org/index.php/AAAI/article/view/11491},
	language     = {en}
}
@misc{Rocktäschel_Grefenstette_Hermann_Kočiský_Blunsom_2016,
	title        = {Reasoning about Entailment with Neural Attention},
	author       = {Rocktäschel, Tim and Grefenstette, Edward and Hermann, Karl Moritz and Kočiský, Tomáš and Blunsom, Phil},
	year         = 2016,
	month        = mar,
	eprint       = {1509.06664},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1509.06664}
}
@article{Rosenblatt_1958,
	title        = {The perceptron: A probabilistic model for information storage and organization in the brain},
	author       = {Rosenblatt, F.},
	year         = 1958,
	journal      = {Psychological Review},
	volume       = 65,
	number       = 6,
	pages        = {386--408},
	doi          = {10.1037/h0042519},
	issn         = {1939-1471, 0033-295X},
	language     = {en}
}
@book{Rothman,
	title        = {Transformers for Natural Language Processing: Build Innovative Deep Neural Network Architectures for {NLP} with {Python}, {PyTorch}, {TensorFlow}, {BERT}, {RoBERTa}, and More},
	author       = {Rothman, Denis},
	year         = 2021,
	publisher    = {Packt Publishing},
	url          = {https://cir.nii.ac.jp/crid/1130287204219740806},
	language     = {en}
}
@article{Roy_Saffar_Vaswani_Grangier_2021,
	title        = {Efficient Content-Based Sparse Attention with Routing Transformers},
	author       = {Roy, Aurko and Saffar, Mohammad and Vaswani, Ashish and Grangier, David},
	year         = 2021,
	month        = feb,
	journal      = {Transactions of the Association for Computational Linguistics},
	volume       = 9,
	pages        = {53--68},
	doi          = {10.1162/tacl_a_00353},
	issn         = {2307-387X},
	language     = {en}
}
@misc{Roy_Constant_Al-Rfou_Barua_Phillips_Yang_2020,
	title        = {{LAReQA}: Language-agnostic answer retrieval from a multilingual pool},
	author       = {Roy, Uma and Constant, Noah and Al-Rfou, Rami and Barua, Aditya and Phillips, Aaron and Yang, Yinfei},
	year         = 2020,
	month        = apr,
	doi          = {10.48550/arXiv.2004.05484},
	eprint       = {2004.05484},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2004.05484}
}
@misc{Ruder_Constant_Botha_Siddhant_Firat_Fu_Liu_Hu_Garrette_Neubig_etal._2021,
	title        = {{XTREME-R}: Towards More Challenging and Nuanced Multilingual Evaluation},
	author       = {Ruder, Sebastian and Constant, Noah and Botha, Jan and Siddhant, Aditya and Firat, Orhan and Fu, Jinlan and Liu, Pengfei and Hu, Junjie and Garrette, Dan and Neubig, Graham and Johnson, Melvin},
	year         = 2021,
	month        = oct,
	eprint       = {2104.07412},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2104.07412}
}
@article{Rumelhart_Hinton_Williams_1986,
	title        = {Learning representations by back-propagating errors},
	author       = {Rumelhart, David E. and Hinton, Geoffrey E. and Williams, Ronald J.},
	year         = 1986,
	month        = oct,
	journal      = {Nature},
	volume       = 323,
	number       = 6088,
	pages        = {533--536},
	doi          = {10.1038/323533a0},
	issn         = {0028-0836, 1476-4687},
	rights       = {http://www.springer.com/tdm},
	language     = {en}
}
@misc{S_1993,
	title        = {{TIMIT} Acoustic Phonetic Continuous Speech Corpus},
	author       = {Garofolo, John S.},
	year         = 1993,
	howpublished = {Linguistic Data Consortium},
	url          = {https://cir.nii.ac.jp/crid/1571135651331964544}
}
@misc{Samangouei_Kabkab_Chellappa_2018,
	title        = {{Defense-GAN}: Protecting Classifiers Against Adversarial Attacks Using Generative Models},
	author       = {Samangouei, Pouya and Kabkab, Maya and Chellappa, Rama},
	year         = 2018,
	month        = may,
	eprint       = {1805.06605},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1805.06605}
}
@misc{Sang_2002,
	title        = {Introduction to the {CoNLL-2002} Shared Task: Language-Independent Named Entity Recognition},
	author       = {Tjong Kim Sang, Erik F.},
	year         = 2002,
	month        = sep,
	doi          = {10.48550/arXiv.cs/0209010},
	eprint       = {cs/0209010},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/cs/0209010}
}
@misc{Sang_De_Meulder_2003,
	title        = {Introduction to the {CoNLL-2003} Shared Task: Language-Independent Named Entity Recognition},
	author       = {Tjong Kim Sang, Erik F. and De Meulder, Fien},
	year         = 2003,
	month        = jun,
	doi          = {10.48550/arXiv.cs/0306050},
	eprint       = {cs/0306050},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/cs/0306050}
}
@misc{Schneider_Baevski_Collobert_Auli_2019,
	title        = {{wav2vec}: Unsupervised Pre-training for Speech Recognition},
	author       = {Schneider, Steffen and Baevski, Alexei and Collobert, Ronan and Auli, Michael},
	year         = 2019,
	month        = sep,
	doi          = {10.48550/arXiv.1904.05862},
	eprint       = {1904.05862},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1904.05862}
}
@inproceedings{Schuster_Nakajima_2012,
	title        = {{Japanese} and {Korean} voice search},
	author       = {Schuster, Mike and Nakajima, Kaisuke},
	year         = 2012,
	month        = mar,
	booktitle    = {2012 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
	publisher    = {IEEE},
	address      = {Kyoto, Japan},
	pages        = {5149--5152},
	doi          = {10.1109/ICASSP.2012.6289079},
	isbn         = {978-1-4673-0046-9},
	url          = {http://ieeexplore.ieee.org/document/6289079/}
}
@misc{Schwenk_Li_2018,
	title        = {A Corpus for Multilingual Document Classification in Eight Languages},
	author       = {Schwenk, Holger and Li, Xian},
	year         = 2018,
	month        = may,
	doi          = {10.48550/arXiv.1805.09821},
	eprint       = {1805.09821},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1805.09821}
}
@misc{Sennrich_Haddow_Birch_2016a,
	title        = {Improving Neural Machine Translation Models with Monolingual Data},
	author       = {Sennrich, Rico and Haddow, Barry and Birch, Alexandra},
	year         = 2016,
	month        = jun,
	doi          = {10.48550/arXiv.1511.06709},
	eprint       = {1511.06709},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1511.06709}
}
@misc{Sennrich_Haddow_Birch_2016b,
	title        = {Neural Machine Translation of Rare Words with Subword Units},
	author       = {Sennrich, Rico and Haddow, Barry and Birch, Alexandra},
	year         = 2016,
	month        = jun,
	doi          = {10.48550/arXiv.1508.07909},
	eprint       = {1508.07909},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1508.07909}
}
@misc{Shaw_Uszkoreit_Vaswani_2018,
	title        = {Self-Attention with Relative Position Representations},
	author       = {Shaw, Peter and Uszkoreit, Jakob and Vaswani, Ashish},
	year         = 2018,
	month        = apr,
	doi          = {10.48550/arXiv.1803.02155},
	eprint       = {1803.02155},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1803.02155}
}
@misc{Shazeer_2019,
	title        = {Fast Transformer Decoding: One Write-Head is All You Need},
	author       = {Shazeer, Noam},
	year         = 2019,
	month        = nov,
	eprint       = {1911.02150},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1911.02150}
}
@misc{Shazeer_Lan_Cheng_Ding_Hou_2020,
	title        = {Talking-Heads Attention},
	author       = {Shazeer, Noam and Lan, Zhenzhong and Cheng, Youlong and Ding, Nan and Hou, Le},
	year         = 2020,
	month        = mar,
	eprint       = {2003.02436},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2003.02436}
}
@misc{Shazeer_Mirhoseini_Maziarz_Davis_Le_Hinton_Dean_2017,
	title        = {Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer},
	author       = {Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and Davis, Andy and Le, Quoc and Hinton, Geoffrey and Dean, Jeff},
	year         = 2017,
	month        = jan,
	eprint       = {1701.06538},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1701.06538}
}
@inproceedings{Shrikumar_Greenside_Kundaje_2017,
	title        = {Learning Important Features Through Propagating Activation Differences},
	author       = {Shrikumar, Avanti and Greenside, Peyton and Kundaje, Anshul},
	year         = 2017,
	month        = jul,
	booktitle    = {Proceedings of the 34th International Conference on Machine Learning},
	publisher    = {PMLR},
	series       = {Proceedings of Machine Learning Research},
	volume       = 70,
	pages        = {3145--3153},
	issn         = {2640-3498},
	url          = {https://proceedings.mlr.press/v70/shrikumar17a.html},
	language     = {en}
}
@inproceedings{Singh_McCann_Socher_Xiong_2019,
	title        = {{BERT} is Not an Interlingua and the Bias of Tokenization},
	author       = {Singh, Jasdeep and McCann, Bryan and Socher, Richard and Xiong, Caiming},
	year         = 2019,
	month        = nov,
	booktitle    = {Proceedings of the 2nd Workshop on Deep Learning Approaches for Low-Resource NLP (DeepLo 2019)},
	publisher    = {Association for Computational Linguistics},
	address      = {Hong Kong, China},
	pages        = {47--55},
	doi          = {10.18653/v1/D19-6106},
	url          = {https://aclanthology.org/D19-6106},
	editor       = {Cherry, Colin and Durrett, Greg and Foster, George and Haffari, Reza and Khadivi, Shahram and Peng, Nanyun and Ren, Xiang and Swayamdipta, Swabha}
}
@inproceedings{Socher_Perelygin_Wu_Chuang_Manning_Ng_Potts_2013,
	title        = {Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank},
	author       = {Socher, Richard and Perelygin, Alex and Wu, Jean and Chuang, Jason and Manning, Christopher D. and Ng, Andrew and Potts, Christopher},
	year         = 2013,
	month        = oct,
	booktitle    = {Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing},
	publisher    = {Association for Computational Linguistics},
	address      = {Seattle, Washington, USA},
	pages        = {1631--1642},
	url          = {https://aclanthology.org/D13-1170},
	editor       = {Yarowsky, David and Baldwin, Timothy and Korhonen, Anna and Livescu, Karen and Bethard, Steven}
}
@misc{Song_Tan_Qin_Lu_Liu_2019,
	title        = {{MASS}: Masked Sequence to Sequence Pre-training for Language Generation},
	author       = {Song, Kaitao and Tan, Xu and Qin, Tao and Lu, Jianfeng and Liu, Tie-Yan},
	year         = 2019,
	month        = jun,
	eprint       = {1905.02450},
	archiveprefix = {arXiv},
	doi          = {10.48550/arXiv.1905.02450},
	url          = {https://arxiv.org/abs/1905.02450}
}
@misc{Su_Lu_Pan_Murtadha_Wen_Liu_2023,
	title        = {{RoFormer}: Enhanced Transformer with Rotary Position Embedding},
	author       = {Su, Jianlin and Lu, Yu and Pan, Shengfeng and Murtadha, Ahmed and Wen, Bo and Liu, Yunfeng},
	year         = 2023,
	month        = nov,
	eprint       = {2104.09864},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2104.09864}
}
@article{Su_Vargas_Sakurai_2019,
	title        = {One Pixel Attack for Fooling Deep Neural Networks},
	author       = {Su, Jiawei and Vargas, Danilo Vasconcellos and Sakurai, Kouichi},
	year         = 2019,
	month        = oct,
	journal      = {IEEE Transactions on Evolutionary Computation},
	volume       = 23,
	number       = 5,
	pages        = {828--841},
	doi          = {10.1109/TEVC.2019.2890858},
	issn         = {1089-778X, 1941-0026},
	rights       = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html}
}
@misc{Subramanian_Collobert_Ranzato_Boureau_2020,
	title        = {Multi-scale Transformer Language Models},
	author       = {Subramanian, Sandeep and Collobert, Ronan and Ranzato, Marc'Aurelio and Boureau, Y.-Lan},
	year         = 2020,
	month        = may,
	eprint       = {2005.00581},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2005.00581}
}
@misc{Sukhbaatar_Grave_Bojanowski_Joulin_2019,
	title        = {Adaptive Attention Span in Transformers},
	author       = {Sukhbaatar, Sainbayar and Grave, Edouard and Bojanowski, Piotr and Joulin, Armand},
	year         = 2019,
	month        = aug,
	eprint       = {1905.07799},
	archiveprefix = {arXiv},
	doi          = {10.48550/arXiv.1905.07799},
	url          = {https://arxiv.org/abs/1905.07799}
}
@inproceedings{Sundararajan_Taly_Yan_2017,
	title        = {Axiomatic Attribution for Deep Networks},
	author       = {Sundararajan, Mukund and Taly, Ankur and Yan, Qiqi},
	year         = 2017,
	month        = jul,
	booktitle    = {Proceedings of the 34th International Conference on Machine Learning},
	publisher    = {PMLR},
	series       = {Proceedings of Machine Learning Research},
	volume       = 70,
	pages        = {3319--3328},
	issn         = {2640-3498},
	url          = {https://proceedings.mlr.press/v70/sundararajan17a.html},
	language     = {en}
}
@inproceedings{Sutskever_Vinyals_Le_2014,
	title        = {Sequence to Sequence Learning with Neural Networks},
	author       = {Sutskever, Ilya and Vinyals, Oriol and Le, Quoc V.},
	year         = 2014,
	booktitle    = {Advances in Neural Information Processing Systems},
	publisher    = {Curran Associates, Inc.},
	volume       = 27,
	url          = {https://proceedings.neurips.cc/paper/2014/hash/a14ac55a4f27472c5d894ec1c3c743d2-Abstract.html}
}
% NOTE(review): the exported record carried no year, publisher or edition.
% The second edition (MIT Press, 2018) is assumed here; confirm which edition is actually cited.
@book{Sutton_Barto,
	title        = {Reinforcement Learning: An Introduction},
	author       = {Sutton, Richard S. and Barto, Andrew G.},
	year         = 2018,
	publisher    = {MIT Press},
	address      = {Cambridge, MA, USA},
	edition      = {Second},
	language     = {en}
}
@article{Tan_Hooker_Koch_Gordo_Caruana_2023,
	title        = {Considerations When Learning Additive Explanations for Black-Box Models},
	author       = {Tan, Sarah and Hooker, Giles and Koch, Paul and Gordo, Albert and Caruana, Rich},
	year         = 2023,
	month        = sep,
	journal      = {Machine Learning},
	volume       = 112,
	number       = 9,
	pages        = {3333--3359},
	doi          = {10.1007/s10994-023-06335-8},
	issn         = {0885-6125, 1573-0565},
	eprint       = {1801.08640},
	archiveprefix = {arXiv}
}
@misc{Tay_Bahri_Metzler_Juan_Zhao_Zheng_2021,
	title        = {Synthesizer: Rethinking Self-Attention in Transformer Models},
	author       = {Tay, Yi and Bahri, Dara and Metzler, Donald and Juan, Da-Cheng and Zhao, Zhe and Zheng, Che},
	year         = 2021,
	month        = may,
	eprint       = {2005.00743},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2005.00743}
}
@inproceedings{Tay_Bahri_Yang_Metzler_Juan_2020,
	title        = {Sparse {Sinkhorn} Attention},
	author       = {Tay, Yi and Bahri, Dara and Yang, Liu and Metzler, Donald and Juan, Da-Cheng},
	year         = 2020,
	month        = nov,
	booktitle    = {Proceedings of the 37th International Conference on Machine Learning},
	publisher    = {PMLR},
	series       = {Proceedings of Machine Learning Research},
	volume       = 119,
	pages        = {9438--9447},
	issn         = {2640-3498},
	url          = {https://proceedings.mlr.press/v119/tay20a.html},
	language     = {en}
}
@misc{Tay_Dehghani_Bahri_Metzler_2022,
	title        = {Efficient Transformers: A Survey},
	author       = {Tay, Yi and Dehghani, Mostafa and Bahri, Dara and Metzler, Donald},
	year         = 2022,
	month        = mar,
	eprint       = {2009.06732},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2009.06732}
}
@misc{Tenney_Das_Pavlick_2019,
	title        = {{BERT} Rediscovers the Classical {NLP} Pipeline},
	author       = {Tenney, Ian and Das, Dipanjan and Pavlick, Ellie},
	year         = 2019,
	month        = aug,
	eprint       = {1905.05950},
	archiveprefix = {arXiv},
	doi          = {10.48550/arXiv.1905.05950},
	url          = {https://arxiv.org/abs/1905.05950}
}
@misc{Tenney_Xia_Chen_Wang_Poliak_McCoy_Kim_Van_Durme_Bowman_Das_etal._2019,
	title        = {What do you learn from context? Probing for sentence structure in contextualized word representations},
	author       = {Tenney, Ian and Xia, Patrick and Chen, Berlin and Wang, Alex and Poliak, Adam and McCoy, R. Thomas and Kim, Najoung and Van Durme, Benjamin and Bowman, Samuel R. and Das, Dipanjan and Pavlick, Ellie},
	year         = 2019,
	month        = may,
	eprint       = {1905.06316},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1905.06316}
}
@article{Varshney_Alemzadeh_2017,
	title        = {On the Safety of Machine Learning: Cyber-Physical Systems, Decision Sciences, and Data Products},
	author       = {Varshney, Kush R. and Alemzadeh, Homa},
	year         = 2017,
	month        = sep,
	journal      = {Big Data},
	volume       = 5,
	number       = 3,
	pages        = {246--255},
	doi          = {10.1089/big.2016.0051},
	issn         = {2167-6461, 2167-647X},
	language     = {en}
}
@misc{Vaswani_Shazeer_Parmar_Uszkoreit_Jones_Gomez_Kaiser_Polosukhin_2023,
	title        = {Attention Is All You Need},
	author       = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Lukasz and Polosukhin, Illia},
	year         = 2023,
	month        = aug,
	eprint       = {1706.03762},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1706.03762}
}
@misc{Vig_2019,
	title        = {A Multiscale Visualization of Attention in the Transformer Model},
	author       = {Vig, Jesse},
	year         = 2019,
	month        = jun,
	eprint       = {1906.05714},
	archiveprefix = {arXiv},
	doi          = {10.48550/arXiv.1906.05714},
	url          = {https://arxiv.org/abs/1906.05714}
}
@inproceedings{deVries_van_Cranenburgh_Nissim_2020,
	title        = {What's so special about {BERT}'s layers? A closer look at the {NLP} pipeline in monolingual and multilingual models},
	author       = {de Vries, Wietse and van Cranenburgh, Andreas and Nissim, Malvina},
	year         = 2020,
	month        = nov,
	booktitle    = {Findings of the Association for Computational Linguistics: EMNLP 2020},
	publisher    = {Association for Computational Linguistics},
	address      = {Online},
	pages        = {4339--4350},
	doi          = {10.18653/v1/2020.findings-emnlp.389},
	eprint       = {2004.06499},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2004.06499}
}
@misc{Vyas_Katharopoulos_Fleuret_2020,
	title        = {Fast Transformers with Clustered Attention},
	author       = {Vyas, Apoorv and Katharopoulos, Angelos and Fleuret, François},
	year         = 2020,
	month        = sep,
	eprint       = {2007.04825},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2007.04825}
}
@misc{Wang_Singh_Michael_Hill_Levy_Bowman_2019,
	title        = {{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding},
	author       = {Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R.},
	year         = 2019,
	month        = feb,
	eprint       = {1804.07461},
	archiveprefix = {arXiv},
	doi          = {10.48550/arXiv.1804.07461},
	url          = {https://arxiv.org/abs/1804.07461}
}
@misc{Wang_Wu_Pino_Baevski_Auli_Conneau_2021,
	title        = {Large-Scale Self- and Semi-Supervised Learning for Speech Translation},
	author       = {Wang, Changhan and Wu, Anne and Pino, Juan and Baevski, Alexei and Auli, Michael and Conneau, Alexis},
	year         = 2021,
	month        = apr,
	eprint       = {2104.06678},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2104.06678}
}
@misc{Wang_Li_Khabsa_Fang_Ma_2020,
	title        = {Linformer: Self-Attention with Linear Complexity},
	author       = {Wang, Sinong and Li, Belinda Z. and Khabsa, Madian and Fang, Han and Ma, Hao},
	year         = 2020,
	month        = jun,
	eprint       = {2006.04768},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2006.04768}
}
@misc{Wang_Jiang_Bach_Wang_Huang_Tu_2020,
	title        = {Structure-Level Knowledge Distillation For Multilingual Sequence Labeling},
	author       = {Wang, Xinyu and Jiang, Yong and Bach, Nguyen and Wang, Tao and Huang, Fei and Tu, Kewei},
	year         = 2020,
	month        = may,
	eprint       = {2004.03846},
	archiveprefix = {arXiv},
	doi          = {10.48550/arXiv.2004.03846},
	url          = {https://arxiv.org/abs/2004.03846}
}
@misc{Wang_Che_Guo_Liu_Liu_2019,
	title        = {Cross-Lingual {BERT} Transformation for Zero-Shot Dependency Parsing},
	author       = {Wang, Yuxuan and Che, Wanxiang and Guo, Jiang and Liu, Yijia and Liu, Ting},
	year         = 2019,
	month        = sep,
	eprint       = {1909.06775},
	archiveprefix = {arXiv},
	doi          = {10.48550/arXiv.1909.06775},
	url          = {https://arxiv.org/abs/1909.06775}
}
@misc{Wang_Ma_Liu_Tang_2019,
	title        = {{R-Transformer}: Recurrent Neural Network Enhanced Transformer},
	author       = {Wang, Zhiwei and Ma, Yao and Liu, Zitao and Tang, Jiliang},
	year         = 2019,
	month        = jul,
	eprint       = {1907.05572},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1907.05572}
}
@misc{Wang_Xie_Xu_Yang_Neubig_Carbonell_2020,
	title        = {Cross-lingual Alignment vs Joint Training: A Comparative Study and A Simple Unified Framework},
	author       = {Wang, Zirui and Xie, Jiateng and Xu, Ruochen and Yang, Yiming and Neubig, Graham and Carbonell, Jaime},
	year         = 2020,
	month        = feb,
	eprint       = {1910.04708},
	archiveprefix = {arXiv},
	doi          = {10.48550/arXiv.1910.04708},
	url          = {https://arxiv.org/abs/1910.04708}
}
@misc{Warstadt_Singh_Bowman_2019,
	title        = {Neural Network Acceptability Judgments},
	author       = {Warstadt, Alex and Singh, Amanpreet and Bowman, Samuel R.},
	year         = 2019,
	month        = oct,
	eprint       = {1805.12471},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1805.12471}
}
@article{Watts_Strogatz_1998,
	title        = {Collective dynamics of `small-world' networks},
	author       = {Watts, Duncan J. and Strogatz, Steven H.},
	year         = 1998,
	month        = jun,
	journal      = {Nature},
	volume       = 393,
	number       = 6684,
	pages        = {440--442},
	doi          = {10.1038/30918},
	issn         = {0028-0836, 1476-4687},
	rights       = {http://www.springer.com/tdm},
	language     = {en}
}
@misc{Wei_Weng_Hu_Xing_Yu_Luo_2021,
	title        = {On Learning Universal Representations Across Languages},
	author       = {Wei, Xiangpeng and Weng, Rongxiang and Hu, Yue and Xing, Luxi and Yu, Heng and Luo, Weihua},
	year         = 2021,
	month        = mar,
	eprint       = {2007.15960},
	archiveprefix = {arXiv},
	doi          = {10.48550/arXiv.2007.15960},
	url          = {https://arxiv.org/abs/2007.15960}
}
@inproceedings{Whittlestone_Nyrup_Alexandrova_Cave_2019,
	title        = {The Role and Limits of Principles in {AI} Ethics: Towards a Focus on Tensions},
	author       = {Whittlestone, Jess and Nyrup, Rune and Alexandrova, Anna and Cave, Stephen},
	year         = 2019,
	booktitle    = {Proceedings of the 2019 AAAI/ACM Conference on AI, Ethics, and Society},
	publisher    = {Association for Computing Machinery},
	address      = {New York, NY, USA},
	series       = {AIES '19},
	pages        = {195--200},
	doi          = {10.1145/3306618.3314289},
	isbn         = {978-1-4503-6324-2},
	url          = {https://dl.acm.org/doi/10.1145/3306618.3314289}
}
@misc{Wiegreffe_Pinter_2019,
	title        = {Attention is not not Explanation},
	author       = {Wiegreffe, Sarah and Pinter, Yuval},
	year         = 2019,
	month        = sep,
	eprint       = {1908.04626},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1908.04626}
}
@misc{Wu_Wu_Qi_Huang_2021,
	title        = {{Hi-Transformer}: Hierarchical Interactive Transformer for Efficient and Effective Long Document Modeling},
	author       = {Wu, Chuhan and Wu, Fangzhao and Qi, Tao and Huang, Yongfeng},
	year         = 2021,
	month        = dec,
	eprint       = {2106.01040},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2106.01040}
}
@misc{Wu_Lan_Qian_Gu_Geramifard_Yu_2022,
	title        = {Memformer: A Memory-Augmented Transformer for Sequence Modeling},
	author       = {Wu, Qingyang and Lan, Zhenzhong and Qian, Kun and Gu, Jing and Geramifard, Alborz and Yu, Zhou},
	year         = 2022,
	month        = apr,
	eprint       = {2010.06891},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2010.06891}
}
@misc{Wu_Dredze_2019,
	title        = {{Beto}, {Bentz}, {Becas}: The Surprising Cross-Lingual Effectiveness of {BERT}},
	author       = {Wu, Shijie and Dredze, Mark},
	year         = 2019,
	month        = oct,
	eprint       = {1904.09077},
	archiveprefix = {arXiv},
	doi          = {10.48550/arXiv.1904.09077},
	url          = {https://arxiv.org/abs/1904.09077}
}
@misc{Wu_Liu_Lin_Lin_Han_2020,
	title        = {Lite Transformer with Long-Short Range Attention},
	author       = {Wu, Zhanghao and Liu, Zhijian and Lin, Ji and Lin, Yujun and Han, Song},
	year         = 2020,
	month        = apr,
	eprint       = {2004.11886},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2004.11886}
}
@misc{Xin_Tang_Lee_Yu_Lin_2020,
	title        = {{DeeBERT}: Dynamic Early Exiting for Accelerating {BERT} Inference},
	author       = {Xin, Ji and Tang, Raphael and Lee, Jaejun and Yu, Yaoliang and Lin, Jimmy},
	year         = 2020,
	month        = apr,
	eprint       = {2004.12993},
	archiveprefix = {arXiv},
	doi          = {10.48550/arXiv.2004.12993},
	url          = {https://arxiv.org/abs/2004.12993}
}
@article{Xiong_Zeng_Chakraborty_Tan_Fung_Li_Singh_2021,
	title        = {Nyströmformer: A {Nyström}-based Algorithm for Approximating Self-Attention},
	author       = {Xiong, Yunyang and Zeng, Zhanpeng and Chakraborty, Rudrasis and Tan, Mingxing and Fung, Glenn and Li, Yin and Singh, Vikas},
	year         = 2021,
	month        = may,
	journal      = {Proceedings of the AAAI Conference on Artificial Intelligence},
	volume       = 35,
	number       = 16,
	pages        = {14138--14148},
	doi          = {10.1609/aaai.v35i16.17664},
	issn         = {2374-3468, 2159-5399}
}
@inproceedings{Xu_Ba_Kiros_Cho_Courville_Salakhudinov_Zemel_Bengio_2015,
	title        = {Show, Attend and Tell: Neural Image Caption Generation with Visual Attention},
	author       = {Xu, Kelvin and Ba, Jimmy and Kiros, Ryan and Cho, Kyunghyun and Courville, Aaron and Salakhudinov, Ruslan and Zemel, Rich and Bengio, Yoshua},
	year         = 2015,
	month        = jun,
	booktitle    = {Proceedings of the 32nd International Conference on Machine Learning},
	publisher    = {PMLR},
	series       = {Proceedings of Machine Learning Research},
	volume       = 37,
	pages        = {2048--2057},
	issn         = {1938-7228},
	url          = {https://proceedings.mlr.press/v37/xuc15.html},
	language     = {en}
}
@incollection{Xu_Duan_Cai_Chia_Xu_Tian_2004,
	title        = {{HMM}-Based Audio Keyword Generation},
	author       = {Xu, Min and Duan, Ling-Yu and Cai, Jianfei and Chia, Liang-Tien and Xu, Changsheng and Tian, Qi},
	year         = 2004,
	booktitle    = {Advances in Multimedia Information Processing - PCM 2004},
	publisher    = {Springer Berlin Heidelberg},
	address      = {Berlin, Heidelberg},
	series       = {Lecture Notes in Computer Science},
	volume       = 3333,
	pages        = {566--574},
	doi          = {10.1007/978-3-540-30543-9_71},
	isbn         = {978-3-540-23985-7},
	url          = {http://link.springer.com/10.1007/978-3-540-30543-9_71},
	editor       = {Aizawa, Kiyoharu and Nakamura, Yuichi and Satoh, Shin'ichi},
	language     = {en}
}
@misc{Xue_Barua_Constant_Al-Rfou_Narang_Kale_Roberts_Raffel_2022,
	title        = {{ByT5}: Towards a token-free future with pre-trained byte-to-byte models},
	author       = {Xue, Linting and Barua, Aditya and Constant, Noah and Al-Rfou, Rami and Narang, Sharan and Kale, Mihir and Roberts, Adam and Raffel, Colin},
	year         = 2022,
	month        = mar,
	eprint       = {2105.13626},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2105.13626}
}
@misc{Xue_Constant_Roberts_Kale_Al-Rfou_Siddhant_Barua_Raffel_2021,
	title        = {{mT5}: A massively multilingual pre-trained text-to-text transformer},
	author       = {Xue, Linting and Constant, Noah and Roberts, Adam and Kale, Mihir and Al-Rfou, Rami and Siddhant, Aditya and Barua, Aditya and Raffel, Colin},
	year         = 2021,
	month        = mar,
	eprint       = {2010.11934},
	archiveprefix = {arXiv},
	doi          = {10.48550/arXiv.2010.11934},
	url          = {https://arxiv.org/abs/2010.11934}
}
% Citation key kept as-is so existing citations still resolve; the exported record had the
% author's name parts swapped and a typo in the title.
% NOTE(review): title accents and the conference location (Paris) were restored from memory; confirm against the original.
@inproceedings{Y_1985,
	title        = {Une procédure d'apprentissage pour réseau à seuil asymétrique},
	author       = {LeCun, Yann},
	year         = 1985,
	booktitle    = {Proceedings of Cognitiva 85},
	address      = {Paris, France},
	pages        = {599--604}
}
@article{Yang_Ma_Zhang_Wu_Li_Zhou_2020,
	title        = {Alternating Language Modeling for Cross-Lingual Pre-Training},
	author       = {Yang, Jian and Ma, Shuming and Zhang, Dongdong and Wu, ShuangZhi and Li, Zhoujun and Zhou, Ming},
	year         = 2020,
	month        = apr,
	journal      = {Proceedings of the AAAI Conference on Artificial Intelligence},
	volume       = 34,
	number       = {05},
	pages        = {9386--9393},
	doi          = {10.1609/aaai.v34i05.6480},
	issn         = {2374-3468, 2159-5399},
	rights       = {https://www.aaai.org}
}
@misc{Yang_Abrego_Yuan_Guo_Shen_Cer_Sung_Strope_Kurzweil_2019,
	title        = {Improving Multilingual Sentence Embedding using Bi-directional Dual Encoder with Additive Margin Softmax},
	author       = {Yang, Yinfei and Abrego, Gustavo Hernandez and Yuan, Steve and Guo, Mandy and Shen, Qinlan and Cer, Daniel and Sung, Yun-hsuan and Strope, Brian and Kurzweil, Ray},
	year         = 2019,
	month        = jun,
	eprint       = {1902.08564},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1902.08564}
}
@misc{Yang_Cer_Ahmad_Guo_Law_Constant_Abrego_Yuan_Tar_Sung_etal._2019,
	title        = {Multilingual Universal Sentence Encoder for Semantic Retrieval},
	author       = {Yang, Yinfei and Cer, Daniel and Ahmad, Amin and Guo, Mandy and Law, Jax and Constant, Noah and Abrego, Gustavo Hernandez and Yuan, Steve and Tar, Chris and Sung, Yun-Hsuan and Strope, Brian and Kurzweil, Ray},
	year         = 2019,
	month        = jul,
	eprint       = {1907.04307},
	archiveprefix = {arXiv},
	doi          = {10.48550/arXiv.1907.04307},
	url          = {https://arxiv.org/abs/1907.04307}
}
@misc{Yang_Zhang_Tar_Baldridge_2019,
	title        = {{PAWS-X}: A Cross-lingual Adversarial Dataset for Paraphrase Identification},
	author       = {Yang, Yinfei and Zhang, Yuan and Tar, Chris and Baldridge, Jason},
	year         = 2019,
	month        = aug,
	eprint       = {1908.11828},
	archiveprefix = {arXiv},
	doi          = {10.48550/arXiv.1908.11828},
	url          = {https://arxiv.org/abs/1908.11828}
}
@inproceedings{Yang_Yang_Dyer_He_Smola_Hovy_2016,
	title        = {Hierarchical Attention Networks for Document Classification},
	author       = {Yang, Zichao and Yang, Diyi and Dyer, Chris and He, Xiaodong and Smola, Alex and Hovy, Eduard},
	year         = 2016,
	booktitle    = {Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
	publisher    = {Association for Computational Linguistics},
	address      = {San Diego, California},
	pages        = {1480--1489},
	doi          = {10.18653/v1/N16-1174},
	url          = {https://aclanthology.org/N16-1174},
	language     = {en}
}
@inproceedings{Yim_Joo_Bae_Kim_2017,
	title        = {A Gift From Knowledge Distillation: Fast Optimization, Network Minimization and Transfer Learning},
	author       = {Yim, Junho and Joo, Donggyu and Bae, Jihoon and Kim, Junmo},
	year         = 2017,
	month        = jul,
	booktitle    = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
	pages        = {4133--4141},
	url          = {https://openaccess.thecvf.com/content_cvpr_2017/html/Yim_A_Gift_From_CVPR_2017_paper.html}
}
@misc{Yin_Neubig_Yih_Riedel_2020,
	title        = {{TaBERT}: Pretraining for Joint Understanding of Textual and Tabular Data},
	author       = {Yin, Pengcheng and Neubig, Graham and Yih, Wen-tau and Riedel, Sebastian},
	year         = 2020,
	month        = may,
	eprint       = {2005.08314},
	archiveprefix = {arXiv},
	doi          = {10.48550/arXiv.2005.08314},
	url          = {https://arxiv.org/abs/2005.08314}
}
@article{Yuan_He_Zhu_Li_2019,
	title        = {Adversarial Examples: Attacks and Defenses for Deep Learning},
	author       = {Yuan, Xiaoyong and He, Pan and Zhu, Qile and Li, Xiaolin},
	year         = 2019,
	month        = sep,
	journal      = {IEEE Transactions on Neural Networks and Learning Systems},
	volume       = 30,
	number       = 9,
	pages        = {2805--2824},
	doi          = {10.1109/TNNLS.2018.2886017},
	issn         = {2162-237X, 2162-2388},
	rights       = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html}
}
@misc{Zaheer_Guruganesh_Dubey_Ainslie_Alberti_Ontanon_Pham_Ravula_Wang_Yang_etal._2021,
	title        = {{Big Bird}: Transformers for Longer Sequences},
	author       = {Zaheer, Manzil and Guruganesh, Guru and Dubey, Avinava and Ainslie, Joshua and Alberti, Chris and Ontanon, Santiago and Pham, Philip and Ravula, Anirudh and Wang, Qifan and Yang, Li and Ahmed, Amr},
	year         = 2021,
	month        = jan,
	eprint       = {2007.14062},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2007.14062}
}
@inproceedings{Zellers_Bisk_Farhadi_Choi_2019,
	title        = {From Recognition to Cognition: Visual Commonsense Reasoning},
	author       = {Zellers, Rowan and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
	year         = 2019,
	month        = jun,
	booktitle    = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
	pages        = {6720--6731},
	url          = {https://openaccess.thecvf.com/content_CVPR_2019/html/Zellers_From_Recognition_to_Cognition_Visual_Commonsense_Reasoning_CVPR_2019_paper.html}
}
@article{Zhang_Cao_Wu_Zhu_2017,
	title        = {Growing Interpretable Part Graphs on {ConvNets} via Multi-Shot Learning},
	author       = {Zhang, Quanshi and Cao, Ruiming and Wu, Ying Nian and Zhu, Song-Chun},
	year         = 2017,
	month        = feb,
	journal      = {Proceedings of the AAAI Conference on Artificial Intelligence},
	volume       = 31,
	number       = 1,
	doi          = {10.1609/aaai.v31i1.10924},
	issn         = {2374-3468, 2159-5399},
	url          = {https://ojs.aaai.org/index.php/AAAI/article/view/10924}
}
@misc{Zhang_Sheng_Alhazmi_Li_2019,
	title        = {Adversarial Attacks on Deep Learning Models in Natural Language Processing: A Survey},
	author       = {Zhang, Wei Emma and Sheng, Quan Z. and Alhazmi, Ahoud and Li, Chenliang},
	year         = 2019,
	month        = apr,
	eprint       = {1901.06796},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1901.06796}
}
% NOTE(review): journal inferred from ISSN 1546-2218; volume, issue and pages are still missing
% and should be filled in from the publisher record.
@article{Zhang_Lu_Li_Peng_Zhang_2019,
	title        = {Deep feature fusion model for sentence semantic matching},
	author       = {Zhang, X. and Lu, W. and Li, F. and Peng, X. and Zhang, R.},
	year         = 2019,
	month        = jan,
	journal      = {Computers, Materials \& Continua},
	issn         = {1546-2218},
	url          = {https://opus.lib.uts.edu.au/handle/10453/136542}
}
@misc{Zhang_Zhao_LeCun_2016,
	title        = {Character-level Convolutional Networks for Text Classification},
	author       = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann},
	year         = 2016,
	month        = apr,
	eprint       = {1509.01626},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1509.01626}
}
@misc{Zhang_Wei_Zhou_2019,
	title        = {{HIBERT}: Document Level Pre-training of Hierarchical Bidirectional Transformers for Document Summarization},
	author       = {Zhang, Xingxing and Wei, Furu and Zhou, Ming},
	year         = 2019,
	month        = may,
	eprint       = {1905.06566},
	archiveprefix = {arXiv},
	doi          = {10.48550/arXiv.1905.06566},
	url          = {https://arxiv.org/abs/1905.06566}
}
@misc{Zhao_Eger_Bjerva_Augenstein_2021,
	title        = {Inducing Language-Agnostic Multilingual Representations},
	author       = {Zhao, Wei and Eger, Steffen and Bjerva, Johannes and Augenstein, Isabelle},
	year         = 2021,
	month        = jun,
	eprint       = {2008.09112},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/2008.09112}
}
@misc{Zhao_Dong_Shen_Zhang_Wei_Chen_2021,
	title        = {Memory-Efficient Differentiable Transformer Architecture Search},
	author       = {Zhao, Yuekai and Dong, Li and Shen, Yelong and Zhang, Zhihua and Wei, Furu and Chen, Weizhu},
	year         = 2021,
	month        = may,
	eprint       = {2105.14669},
	archiveprefix = {arXiv},
	doi          = {10.48550/arXiv.2105.14669},
	url          = {https://arxiv.org/abs/2105.14669}
}
@inproceedings{Zheng_Chen_Lu_Zhou_2019,
	title        = {Hardness-Aware Deep Metric Learning},
	author       = {Zheng, Wenzhao and Chen, Zhaodong and Lu, Jiwen and Zhou, Jie},
	year         = 2019,
	month        = jun,
	booktitle    = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
	pages        = {72--81},
	url          = {https://openaccess.thecvf.com/content_CVPR_2019/html/Zheng_Hardness-Aware_Deep_Metric_Learning_CVPR_2019_paper.html}
}
@misc{Zhou_Khosla_Lapedriza_Oliva_Torralba_2015,
	title        = {Object Detectors Emerge in Deep Scene {CNNs}},
	author       = {Zhou, Bolei and Khosla, Aditya and Lapedriza, Agata and Oliva, Aude and Torralba, Antonio},
	year         = 2015,
	month        = apr,
	eprint       = {1412.6856},
	archiveprefix = {arXiv},
	url          = {https://arxiv.org/abs/1412.6856}
}
@article{Zhou_Zhang_Peng_Zhang_Li_Xiong_Zhang_2021,
	title        = {Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting},
	author       = {Zhou, Haoyi and Zhang, Shanghang and Peng, Jieqi and Zhang, Shuai and Li, Jianxin and Xiong, Hui and Zhang, Wancai},
	year         = 2021,
	month        = may,
	journal      = {Proceedings of the AAAI Conference on Artificial Intelligence},
	volume       = 35,
	number       = 12,
	pages        = {11106--11115},
	doi          = {10.1609/aaai.v35i12.17325},
	issn         = {2374-3468, 2159-5399}
}
@inproceedings{Zhu_Kiros_Zemel_Salakhutdinov_Urtasun_Torralba_Fidler_2015,
	title        = {Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books},
	author       = {Zhu, Yukun and Kiros, Ryan and Zemel, Rich and Salakhutdinov, Ruslan and Urtasun, Raquel and Torralba, Antonio and Fidler, Sanja},
	year         = 2015,
	month        = dec,
	booktitle    = {Proceedings of the {IEEE} International Conference on Computer Vision ({ICCV})},
	pages        = {19--27},
	url          = {https://www.cv-foundation.org/openaccess/content_iccv_2015/html/Zhu_Aligning_Books_and_ICCV_2015_paper.html}
}
@article{Zintgraf_Cohen_Adel_Welling_2017,
	author       = {Zintgraf, Luisa M. and Cohen, Taco S. and Adel, Tameem and Welling, Max},
	title        = {Visualizing Deep Neural Network Decisions: Prediction Difference Analysis},
	publisher    = {arXiv},
	number       = {arXiv:1702.04595},
	note         = {arXiv:1702.04595 [cs]},
	url          = {http://arxiv.org/abs/1702.04595},
	month        = feb,
	year         = {2017}
}
@inproceedings{Zweigenbaum_Sharoff_Rapp_2017,
	title        = {Overview of the Second BUCC Shared Task: Spotting Parallel Sentences in Comparable Corpora},
	author       = {Zweigenbaum, Pierre and Sharoff, Serge and Rapp, Reinhard},
	year         = 2017,
	month        = aug,
	booktitle    = {Proceedings of the 10th Workshop on Building and Using Comparable Corpora},
	publisher    = {Association for Computational Linguistics},
	address      = {Vancouver, Canada},
	pages        = {60--67},
	doi          = {10.18653/v1/W17-2512},
	url          = {https://aclanthology.org/W17-2512},
	editor       = {Sharoff, Serge and Zweigenbaum, Pierre and Rapp, Reinhard}
}
@article{Explaining_Classifications_For_Individual_Instances_IEEE_Journals_Magazine_IEEE_Xplore,
	title        = {Explaining Classifications for Individual Instances},
	author       = {Robnik-{\v{S}}ikonja, Marko and Kononenko, Igor},
	year         = 2008,
	journal      = {IEEE Transactions on Knowledge and Data Engineering},
	volume       = 20,
	number       = 5,
	pages        = {589--600},
	publisher    = {IEEE},
	url          = {https://ieeexplore.ieee.org/abstract/document/4407709}
}
@article{Wu_Schuster_Chen_Le_Norouzi_Macherey_Krikun_Cao_Gao_Macherey_etal._2016,
	author       = {Wu, Yonghui and Schuster, Mike and Chen, Zhifeng and Le, Quoc V. and Norouzi, Mohammad and Macherey, Wolfgang and Krikun, Maxim and Cao, Yuan and Gao, Qin and Macherey, Klaus and Klingner, Jeff and Shah, Apurva and Johnson, Melvin and Liu, Xiaobing and Kaiser, Łukasz and Gouws, Stephan and Kato, Yoshikiyo and Kudo, Taku and Kazawa, Hideto and Stevens, Keith and Kurian, George and Patil, Nishant and Wang, Wei and Young, Cliff and Smith, Jason and Riesa, Jason and Rudnick, Alex and Vinyals, Oriol and Corrado, Greg and Hughes, Macduff and Dean, Jeffrey},
	title        = {Google’s Neural Machine Translation System: Bridging the Gap between Human and Machine Translation},
	publisher    = {arXiv},
	number       = {arXiv:1609.08144},
	note         = {arXiv:1609.08144 [cs]},
	url          = {http://arxiv.org/abs/1609.08144},
	month        = oct,
	year         = {2016}
}
@comment{Duplicate definition of Wu_Schuster_Chen_Le_Norouzi_Macherey_Krikun_Cao_Gao_Macherey_etal._2016 removed here. The identical entry directly above is kept, so the citation key still resolves and the repeated-entry error goes away.}
@mastersthesis{linnainmaa1970representation,
	title        = {The Representation of the Cumulative Rounding Error of an Algorithm as a {Taylor} Expansion of the Local Rounding Errors},
	author       = {Linnainmaa, Seppo},
	year         = 1970,
	school       = {University of Helsinki},
	note         = {In Finnish}
}
@phdthesis{werbos1974beyond,
	title        = {Beyond Regression: New Tools for Prediction and Analysis in the Behavioral Sciences},
	author       = {Werbos, Paul},
	year         = 1974,
	school       = {Harvard University},
	address      = {Cambridge, MA},
	note         = {Committee on Applied Mathematics}
}
@article{fukushima1979neural,
	title        = {Neural Network Model for a Mechanism of Pattern Recognition Unaffected by Shift in Position --- {Neocognitron}},
	author       = {Fukushima, Kunihiko},
	year         = 1979,
	journal      = {IEICE Technical Report, A},
	volume       = 62,
	number       = 10,
	pages        = {658--665}
}
@techreport{parker1985learning,
	title        = {Learning-Logic},
	author       = {Parker, David B.},
	year         = 1985,
	institution  = {Sloan School of Management, MIT},
	number       = {TR-47}
}
@inproceedings{lecun1985procedure,
	title        = {Une proc{\'e}dure d'apprentissage pour r{\'e}seau {\`a} seuil asym{\'e}trique},
	author       = {LeCun, Yann},
	year         = 1985,
	booktitle    = {Proceedings of Cognitiva 85},
	address      = {Paris, France},
	pages        = {599--604}
}
 @book{DeeplearningforNLPandspeechrecognition_2019,
	title        = {Deep Learning for {NLP} and Speech Recognition},
	author       = {Kamath, Uday and Liu, John and Whitaker, James},
	year         = 2019,
	publisher    = {Springer Science+Business Media},
	address      = {New York, NY},
	isbn         = {978-3-030-14595-8}
}
@article{rumelhart1986learning,
	title        = {Learning Representations by Back-Propagating Errors},
	author       = {Rumelhart, David E. and Hinton, Geoffrey E. and Williams, Ronald J.},
	year         = 1986,
	journal      = {Nature},
	volume       = 323,
	number       = 6088,
	pages        = {533--536},
	publisher    = {Nature Publishing Group}
}
@phdthesis{sutskever2013training,
	title        = {Training Recurrent Neural Networks},
	author       = {Sutskever, Ilya},
	year         = 2013,
	school       = {University of Toronto},
	address      = {Toronto, ON, Canada}
}
@book{james1983principles,
	title        = {The Principles of Psychology},
	author       = {James, William},
	year         = 1983,
	series       = {The Works of {William James}},
	publisher    = {Harvard University Press},
	address      = {Cambridge, MA},
	note         = {Edited by Frederick H. Burkhardt}
}
@techreport{chen1998evaluation,
	title        = {Evaluation Metrics for Language Models},
	author       = {Chen, Stanley F. and Beeferman, Douglas and Rosenfeld, Roni},
	year         = 1998,
	institution  = {Carnegie Mellon University}
}
@misc{grootendorst2020bertopic,
	title        = {{BERTopic}: Leveraging {BERT} and c-{TF-IDF} to Create Easily Interpretable Topics},
	author       = {Grootendorst, Maarten},
	year         = 2020,
	howpublished = {Zenodo},
	version      = {v0.9.4},
	doi          = {10.5281/zenodo.4381785},
	url          = {https://doi.org/10.5281/zenodo.4381785}
}
@misc{valkov2020sentiment,
	title        = {Sentiment Analysis with {BERT} and Transformers by {Hugging Face} using {PyTorch} and {Python}},
	author       = {Valkov, Venelin},
	year         = 2020,
	howpublished = {Curiousily}
}
@inproceedings{conneau2020unsupervised,
	title        = {Unsupervised Cross-lingual Representation Learning at Scale},
	author       = {Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
	year         = 2020,
	month        = jul,
	booktitle    = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
	publisher    = {Association for Computational Linguistics},
	address      = {Online},
	pages        = {8440--8451},
	doi          = {10.18653/v1/2020.acl-main.747}
}
@article{fuchs2018neural,
	title        = {Neural Stethoscopes: Unifying Analytic, Auxiliary and Adversarial Network Probing},
	author       = {Fuchs, F and Groth, Oliver and Kosiorek, A and Bewley, Alex and Wulfmeier, Markus and Vedaldi, Andrea and Posner, H},
	year         = 2018,
	publisher    = {arXiv},
	number       = {arXiv:1806.05502},
	url          = {http://arxiv.org/abs/1806.05502},
	note         = {arXiv:1806.05502}
}
@article{zhang2019generating,
	title        = {Generating Textual Adversarial Examples for Deep Learning Models: A Survey},
	author       = {Zhang, Wei Emma and Sheng, Quan Z. and Alhazmi, Ahoud Abdulrahmn F. and Li, Chenliang},
	year         = 2019,
	publisher    = {arXiv},
	number       = {arXiv:1901.06796},
	url          = {http://arxiv.org/abs/1901.06796},
	note         = {arXiv:1901.06796}
}
@inproceedings{moosavi2017universal,
	author       = {Moosavi-Dezfooli, Seyed-Mohsen and Fawzi, Alhussein and Fawzi, Omar and Frossard, Pascal},
	title        = {Universal adversarial perturbations},
	booktitle    = {Proceedings of the IEEE conference on computer vision and pattern recognition},
	year         = 2017,
	pages        = {1765--1773}
}
@article{Peters_Neumann_Logan_IV_Schwartz_Joshi_Singh_Smith_2019,
	title        = {Knowledge Enhanced Contextual Word Representations},
	author       = {Peters, Matthew E. and Neumann, Mark and Logan, IV, Robert L. and Schwartz, Roy and Joshi, Vidur and Singh, Sameer and Smith, Noah A.},
	year         = 2019,
	month        = oct,
	publisher    = {arXiv},
	number       = {arXiv:1909.04164},
	url          = {http://arxiv.org/abs/1909.04164},
	note         = {arXiv:1909.04164 [cs]}
}
@inproceedings{he2017unsupervised,
	author       = {He, Ruidan and Lee, Wee Sun and Ng, Hwee Tou and Dahlmeier, Daniel},
	title        = {An unsupervised neural attention model for aspect extraction},
	booktitle    = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
	year         = 2017,
	pages        = {388--397}
}
@book{coeckelbergh2020ai,
	title        = {{AI} Ethics},
	author       = {Coeckelbergh, Mark},
	year         = 2020,
	publisher    = {MIT Press}
}
@article{Erdős_Rényi,
	title        = {On the Evolution of Random Graphs},
	author       = {Erdős, Paul and Rényi, Alfréd},
	year         = 1960,
	journal      = {Publications of the Mathematical Institute of the Hungarian Academy of Sciences},
	volume       = 5,
	pages        = {17--61},
	language     = {en}
}
@article{gray2017gpu,
	title        = {{GPU} Kernels for Block-Sparse Weights},
	author       = {Gray, Scott and Radford, Alec and Kingma, Diederik P.},
	year         = 2017,
	journal      = {arXiv preprint arXiv:1711.09224},
	internal-note = {NOTE(review): volume/number/pages from the citation export were removed as scraping artefacts; the arXiv identifier is kept as exported but could not be confirmed from this file alone. TODO verify before relying on it.}
}
@article{robnik2008explaining,
	author       = {Robnik-{\v{S}}ikonja, Marko and Kononenko, Igor},
	title        = {Explaining classifications for individual instances},
	journal      = {IEEE Transactions on Knowledge and Data Engineering},
	publisher    = {IEEE},
	year         = 2008,
	volume       = 20,
	number       = 5,
	pages        = {589--600}
}
@article{tan2018learning,
	title        = {Learning Global Additive Explanations for Neural Nets Using Model Distillation},
	author       = {Tan, Sarah and Caruana, Rich and Hooker, Giles and Koch, Paul and Gordo, Albert},
	year         = 2018,
	publisher    = {arXiv},
	number       = {arXiv:1801.08640},
	url          = {http://arxiv.org/abs/1801.08640},
	note         = {arXiv:1801.08640}
}